From 378d0fc7f4ff5df5e8e77617c37bcef2b26ddf02 Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 11 May 2021 17:00:26 +0300 Subject: Rename file_utils to fs_helpers Generally avoid naming things with a "utils" prefix/ suffix since it encourages contributors to dump any new functions there; and over time, as the code grows, things get messy... --- gn3/api/gemma.py | 4 +- gn3/api/general.py | 2 +- gn3/computations/gemma.py | 2 +- gn3/file_utils.py | 98 ------------------------------------------- gn3/fs_helpers.py | 98 +++++++++++++++++++++++++++++++++++++++++++ tests/unit/test_file_utils.py | 20 ++++----- 6 files changed, 112 insertions(+), 112 deletions(-) delete mode 100644 gn3/file_utils.py create mode 100644 gn3/fs_helpers.py diff --git a/gn3/api/gemma.py b/gn3/api/gemma.py index 81e185d..6b0b20e 100644 --- a/gn3/api/gemma.py +++ b/gn3/api/gemma.py @@ -9,8 +9,8 @@ from flask import request from gn3.commands import queue_cmd from gn3.commands import run_cmd -from gn3.file_utils import cache_ipfs_file -from gn3.file_utils import jsonfile_to_dict +from gn3.fs_helpers import cache_ipfs_file +from gn3.fs_helpers import jsonfile_to_dict from gn3.computations.gemma import generate_gemma_cmd from gn3.computations.gemma import do_paths_exist diff --git a/gn3/api/general.py b/gn3/api/general.py index 38e6154..a9a8da2 100644 --- a/gn3/api/general.py +++ b/gn3/api/general.py @@ -5,7 +5,7 @@ from flask import current_app from flask import jsonify from flask import request -from gn3.file_utils import extract_uploaded_file +from gn3.fs_helpers import extract_uploaded_file general = Blueprint("general", __name__) diff --git a/gn3/computations/gemma.py b/gn3/computations/gemma.py index 5f9d5a3..0b22d3c 100644 --- a/gn3/computations/gemma.py +++ b/gn3/computations/gemma.py @@ -7,7 +7,7 @@ from typing import Dict from typing import List from typing import ValuesView from gn3.commands import compose_gemma_cmd -from gn3.file_utils import get_hash_of_files +from gn3.fs_helpers import get_hash_of_files def generate_hash_of_string(unhashed_str: str) -> str: diff --git a/gn3/file_utils.py b/gn3/file_utils.py deleted file mode 100644 index 73f6567..0000000 --- a/gn3/file_utils.py +++ /dev/null @@ -1,98 +0,0 @@ -"""Procedures that operate on files/ directories""" -import hashlib -import json -import os -import random -import string -import tarfile -import pathlib - -from functools import partial -from typing import Dict -from typing import List -from werkzeug.utils import secure_filename - -import ipfshttpclient - - -def get_hash_of_files(files: List[str]) -> str: - """Given a list of valid of FILES, return their hash as a string""" - md5hash = hashlib.md5() - for file_path in sorted(files): - if not os.path.exists(file_path): - raise FileNotFoundError - with open(file_path, "rb") as file_: - for buf in iter(partial(file_.read, 4096), b''): - md5hash.update(bytearray( - hashlib.md5(buf).hexdigest(), "utf-8")) - return md5hash.hexdigest() - - -def get_dir_hash(directory: str) -> str: - """Return the hash of a DIRECTORY""" - if not os.path.exists(directory): - raise FileNotFoundError - all_files = [ - os.path.join(root, names) for root, _, files in os.walk(directory) - for names in sorted(files) - ] - return get_hash_of_files(all_files) - - -def jsonfile_to_dict(json_file: str) -> Dict: - """Give a JSON_FILE, return a python dict""" - with open(json_file) as _file: - data = json.load(_file) - return data - raise FileNotFoundError - - -def generate_random_n_string(n_length: int) -> str: - """Generate a random string that is N chars long""" - return ''.join( - random.choice(string.ascii_uppercase + string.digits) - for _ in range(n_length)) - - -def extract_uploaded_file(gzipped_file, - target_dir: str, - token: str = "") -> Dict: - """Get the (directory) hash of extracted contents of GZIPPED_FILE; and move -contents to TARGET_DIR/. - - """ - if not token: - token = (f"{generate_random_n_string(6)}-" - f"{generate_random_n_string(6)}") - tar_target_loc = os.path.join(target_dir, token, - secure_filename(gzipped_file.filename)) - try: - if not os.path.exists(os.path.join(target_dir, token)): - os.mkdir(os.path.join(target_dir, token)) - gzipped_file.save(tar_target_loc) - # Extract to "tar_target_loc/token" - tar = tarfile.open(tar_target_loc) - tar.extractall(path=os.path.join(target_dir, token)) - tar.close() - # pylint: disable=W0703 - except Exception: - return {"status": 128, "error": "gzip failed to unpack file"} - return {"status": 0, "token": token} - - -def cache_ipfs_file(ipfs_file: str, - cache_dir: str, - ipfs_addr: str = "/ip4/127.0.0.1/tcp/5001") -> str: - """Check if a file exists in cache; if it doesn't, cache it. Return the - cached file location - - """ - file_loc = os.path.join(cache_dir, ipfs_file.split("ipfs/")[-1]) - if not os.path.exists(file_loc): - client = ipfshttpclient.connect(ipfs_addr) - client.get(ipfs_file, - target=str( - pathlib.Path - (os.path.join(cache_dir, - ipfs_file.split("ipfs/")[-1])).parent)) - return file_loc diff --git a/gn3/fs_helpers.py b/gn3/fs_helpers.py new file mode 100644 index 0000000..73f6567 --- /dev/null +++ b/gn3/fs_helpers.py @@ -0,0 +1,98 @@ +"""Procedures that operate on files/ directories""" +import hashlib +import json +import os +import random +import string +import tarfile +import pathlib + +from functools import partial +from typing import Dict +from typing import List +from werkzeug.utils import secure_filename + +import ipfshttpclient + + +def get_hash_of_files(files: List[str]) -> str: + """Given a list of valid of FILES, return their hash as a string""" + md5hash = hashlib.md5() + for file_path in sorted(files): + if not os.path.exists(file_path): + raise FileNotFoundError + with open(file_path, "rb") as file_: + for buf in iter(partial(file_.read, 4096), b''): + md5hash.update(bytearray( + hashlib.md5(buf).hexdigest(), "utf-8")) + return md5hash.hexdigest() + + +def get_dir_hash(directory: str) -> str: + """Return the hash of a DIRECTORY""" + if not os.path.exists(directory): + raise FileNotFoundError + all_files = [ + os.path.join(root, names) for root, _, files in os.walk(directory) + for names in sorted(files) + ] + return get_hash_of_files(all_files) + + +def jsonfile_to_dict(json_file: str) -> Dict: + """Give a JSON_FILE, return a python dict""" + with open(json_file) as _file: + data = json.load(_file) + return data + raise FileNotFoundError + + +def generate_random_n_string(n_length: int) -> str: + """Generate a random string that is N chars long""" + return ''.join( + random.choice(string.ascii_uppercase + string.digits) + for _ in range(n_length)) + + +def extract_uploaded_file(gzipped_file, + target_dir: str, + token: str = "") -> Dict: + """Get the (directory) hash of extracted contents of GZIPPED_FILE; and move +contents to TARGET_DIR/. + + """ + if not token: + token = (f"{generate_random_n_string(6)}-" + f"{generate_random_n_string(6)}") + tar_target_loc = os.path.join(target_dir, token, + secure_filename(gzipped_file.filename)) + try: + if not os.path.exists(os.path.join(target_dir, token)): + os.mkdir(os.path.join(target_dir, token)) + gzipped_file.save(tar_target_loc) + # Extract to "tar_target_loc/token" + tar = tarfile.open(tar_target_loc) + tar.extractall(path=os.path.join(target_dir, token)) + tar.close() + # pylint: disable=W0703 + except Exception: + return {"status": 128, "error": "gzip failed to unpack file"} + return {"status": 0, "token": token} + + +def cache_ipfs_file(ipfs_file: str, + cache_dir: str, + ipfs_addr: str = "/ip4/127.0.0.1/tcp/5001") -> str: + """Check if a file exists in cache; if it doesn't, cache it. Return the + cached file location + + """ + file_loc = os.path.join(cache_dir, ipfs_file.split("ipfs/")[-1]) + if not os.path.exists(file_loc): + client = ipfshttpclient.connect(ipfs_addr) + client.get(ipfs_file, + target=str( + pathlib.Path + (os.path.join(cache_dir, + ipfs_file.split("ipfs/")[-1])).parent)) + return file_loc diff --git a/tests/unit/test_file_utils.py b/tests/unit/test_file_utils.py index cc842d5..75be4f6 100644 --- a/tests/unit/test_file_utils.py +++ b/tests/unit/test_file_utils.py @@ -1,14 +1,14 @@ -"""Test cases for procedures defined in file_utils.py""" +"""Test cases for procedures defined in fs_helpers.py""" import os import unittest from dataclasses import dataclass from typing import Callable from unittest import mock -from gn3.file_utils import extract_uploaded_file -from gn3.file_utils import get_dir_hash -from gn3.file_utils import jsonfile_to_dict -from gn3.file_utils import cache_ipfs_file +from gn3.fs_helpers import extract_uploaded_file +from gn3.fs_helpers import get_dir_hash +from gn3.fs_helpers import jsonfile_to_dict +from gn3.fs_helpers import cache_ipfs_file @dataclass @@ -19,7 +19,7 @@ class MockFile: class TestFileUtils(unittest.TestCase): - """Test cases for procedures defined in file_utils.py""" + """Test cases for procedures defined in fs_helpers.py""" def test_get_dir_hash(self): """Test that a directory is hashed correctly""" @@ -45,8 +45,8 @@ non-existent""" self.assertRaises(FileNotFoundError, jsonfile_to_dict, "/non-existent-dir") - @mock.patch("gn3.file_utils.tarfile") - @mock.patch("gn3.file_utils.secure_filename") + @mock.patch("gn3.fs_helpers.tarfile") + @mock.patch("gn3.fs_helpers.secure_filename") def test_extract_uploaded_file(self, mock_file, mock_tarfile): """Test that the gzip file is extracted to the right location""" mock_file.return_value = "upload-data.tar.gz" @@ -65,7 +65,7 @@ non-existent""" mock_file.assert_called_once_with("upload-data.tar.gz") self.assertEqual(result, {"status": 0, "token": "abcdef-abcdef"}) - @mock.patch("gn3.file_utils.secure_filename") + @mock.patch("gn3.fs_helpers.secure_filename") def test_extract_uploaded_file_non_existent_gzip(self, mock_file): """Test that the right error message is returned when there is a problem extracting the file""" @@ -96,7 +96,7 @@ extracting the file""" os.rmdir(test_dir) self.assertEqual(file_loc, f"{test_dir}/genotype.txt") - @mock.patch("gn3.file_utils.ipfshttpclient") + @mock.patch("gn3.fs_helpers.ipfshttpclient") def test_cache_ipfs_file_cache_miss(self, mock_ipfs): """Test that a file is cached if there's a cache miss""" -- cgit v1.2.3