# Source code for fastr.utils.checksum

# Copyright 2011-2014 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This module contains a number of functions for checksumming files and objects
"""
import hashlib
import io
import os
import pickle

from fastr import exceptions


def checksum(filepath, algorithm='md5', hasher=None):
    """
    Generate the checksum of one or more files.

    Every path is processed in the order given. Regular files are read in
    binary mode and fed to the hasher in chunks; directories are handed to
    :func:`checksum_directory` together with the same hasher.

    :param filepath: path of the file(s) to checksum. Either a single path
                     (``str``, ``bytes``, ``unicode`` or an object that
                     implements ``__fspath__``) or an iterable of paths.
    :type filepath: str, list
    :param str algorithm: name of the :mod:`hashlib` algorithm to use; only
                          used when no ``hasher`` is supplied
    :param _hashlib.HASH hasher: a hasher to continue updating (rather than
                                 creating a new one)
    :return: the hexadecimal digest of the hasher after all paths are added
    :rtype: str
    """
    # A single path must be wrapped in a list, otherwise the loop below would
    # iterate over its individual characters/bytes. ``type(u'')`` is
    # ``unicode`` on Python 2 and ``str`` on Python 3.
    single_path_types = (str, bytes, type(u''))
    if isinstance(filepath, single_path_types) or hasattr(filepath, '__fspath__'):
        filepath = [filepath]

    if hasher is None:
        hasher = hashlib.new(algorithm)

    for path in filepath:
        if os.path.isdir(path):
            checksum_directory(path, hasher=hasher)
        else:
            with open(path, 'rb') as file_handle:
                # Read in fixed-size chunks so large files are never held in
                # memory as a whole; read() returns b'' at end of file.
                for data in iter(lambda: file_handle.read(32768), b''):
                    hasher.update(data)

    return hasher.hexdigest()
def _checksum_path_bytes(path):
    """Return *path* as bytes so it can be fed to a hashlib hasher."""
    if isinstance(path, bytes):
        return path
    return path.encode('utf-8')


def checksum_directory(directory, algorithm='md5', hasher=None):
    """
    Generate the checksum of an entire directory.

    The directory tree is walked top-down. For every directory visited, the
    relative paths of its sub-directories are hashed, followed by the
    relative path and the binary content of each of its files. Entries are
    processed in sorted order so the result does not depend on the order in
    which the file system happens to list them.

    :param str directory: path of the directory to checksum
    :param str algorithm: name of the :mod:`hashlib` algorithm to use; only
                          used when no ``hasher`` is supplied
    :param _hashlib.HASH hasher: a hasher to continue updating (rather than
                                 creating a new one)
    :return: the hexadecimal digest of the hasher after the tree is added
    :rtype: str
    :raises FastrValueError: if ``directory`` is not an existing directory
    """
    if not os.path.isdir(directory):
        raise exceptions.FastrValueError(
            '{} does not point to a valid directory!'.format(directory)
        )

    if hasher is None:
        hasher = hashlib.new(algorithm)

    for root, dirs, files in os.walk(directory):
        # Sorting ``dirs`` in place also fixes the order in which os.walk
        # descends into the sub-directories.
        dirs.sort()
        relroot = os.path.relpath(root, directory)

        for dirname in dirs:
            hasher.update(_checksum_path_bytes(os.path.join(relroot, dirname)))

        for filename in sorted(files):
            hasher.update(_checksum_path_bytes(os.path.join(relroot, filename)))

            # Binary mode: the content must be hashed byte for byte, without
            # newline translation or text decoding.
            with open(os.path.join(root, filename), 'rb') as file_handle:
                for data in iter(lambda: file_handle.read(32768), b''):
                    hasher.update(data)

    return hasher.hexdigest()
# ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3.
_HASHSUM_STRING_TYPES = (str, bytes, type(u''))

try:
    # Python 2: the builtin ``file`` type is not an io.IOBase subclass.
    _HASHSUM_FILE_TYPES = (file, io.IOBase)
except NameError:
    _HASHSUM_FILE_TYPES = (io.IOBase,)


def _hashsum_bytes(data):
    """Return *data* as bytes (text is UTF-8 encoded) for ``hasher.update``."""
    if isinstance(data, bytes):
        return data
    return data.encode('utf-8')


def _hashsum_elements(elements, tag, hasher):
    """Hash *elements* one by one, wrapped in start/end markers for *tag*."""
    hasher.update(b'_||' + tag + b'||_')
    for element in elements:
        hashsum(element, hasher=hasher)
        hasher.update(b'_|ENDELEM|_')
    hasher.update(b'_||END' + tag + b'||_')


def hashsum(objects, hasher=None):
    """
    Generate the md5 checksum of (a) python object(s).

    ``objects`` is treated as a list of objects to hash one after another;
    anything that is not a list is wrapped in one first. Each object is
    hashed according to its type:

    * strings are hashed directly (text is encoded as UTF-8)
    * ``bool``, ``int`` and ``float`` are hashed via their ``str()`` form
    * file objects are rewound and their entire content is hashed
    * lists, tuples, dicts and sets are hashed element by element between
      type-specific markers; dict keys and set elements are sorted first
    * everything else is hashed via ``pickle.dumps``

    .. note:: Because a list argument is always interpreted as "a list of
              objects", a list that is passed in through a recursive call
              (e.g. a list nested directly in a tuple, dict or set) is
              hashed without the list markers.

    :param objects: the objects to hash
    :param hasher: the hasher to use as a base; a new md5 hasher is created
                   when omitted
    :return: the hash generated
    :rtype: str
    """
    if hasher is None:
        hasher = hashlib.new('md5')

    if not isinstance(objects, list):
        objects = [objects]

    for obj in objects:
        if isinstance(obj, _HASHSUM_STRING_TYPES):
            hasher.update(_hashsum_bytes(obj))
        elif isinstance(obj, (bool, int, float)):
            hasher.update(_hashsum_bytes(str(obj)))
        elif isinstance(obj, _HASHSUM_FILE_TYPES):
            obj.seek(0)  # Make sure to hash entire file
            while True:
                data = obj.read(32768)
                if not data:
                    break
                # Text-mode files yield text, which has to be encoded first.
                hasher.update(_hashsum_bytes(data))
        elif isinstance(obj, list):
            _hashsum_elements(obj, b'LIST', hasher)
        elif isinstance(obj, tuple):
            _hashsum_elements(obj, b'TUPLE', hasher)
        elif isinstance(obj, dict):
            hasher.update(b'_||DICT||_')
            for key in sorted(obj.keys()):
                hashsum(key, hasher=hasher)
                hasher.update(b'_|ENDKEY|_')
                hashsum(obj[key], hasher=hasher)
                hasher.update(b'_|ENDVALUE|_')
            hasher.update(b'_||ENDDICT||_')
        elif isinstance(obj, set):
            # Sets have no defined order; sort to get a reproducible hash.
            _hashsum_elements(sorted(obj), b'SET', hasher)
        else:
            hasher.update(pickle.dumps(obj))

    return hasher.hexdigest()
def md5_checksum(filepath):
    """
    Compute the md5 checksum of the given file(s).

    Convenience wrapper around :func:`checksum` with the md5 algorithm.

    :param filepath: path of the file(s) to checksum
    :type filepath: str, list
    :return: the checksum
    :rtype: str
    """
    return checksum(filepath, algorithm='md5')
def sha1_checksum(filepath):
    """
    Compute the sha1 checksum of the given file(s).

    Convenience wrapper around :func:`checksum` with the sha1 algorithm.

    :param filepath: path of the file(s) to checksum
    :type filepath: str, list
    :return: the checksum
    :rtype: str
    """
    return checksum(filepath, algorithm='sha1')