Diffstat (limited to '.venv/lib/python3.12/site-packages/numpy/lib/_datasource.py')
-rw-r--r--  .venv/lib/python3.12/site-packages/numpy/lib/_datasource.py  704
1 file changed, 704 insertions(+), 0 deletions(-)
diff --git a/.venv/lib/python3.12/site-packages/numpy/lib/_datasource.py b/.venv/lib/python3.12/site-packages/numpy/lib/_datasource.py
new file mode 100644
index 00000000..613733fa
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/numpy/lib/_datasource.py
@@ -0,0 +1,704 @@
+"""A file interface for handling local and remote data files.
+
+The goal of datasource is to abstract some of the file system operations
+when dealing with data files so the researcher doesn't have to know all the
+low-level details. Through datasource, a researcher can obtain and use a
+file with one function call, regardless of the file's location.
+
+DataSource is meant to augment standard Python libraries, not replace them.
+It should work seamlessly with standard file IO operations and the os
+module.
+
+DataSource files can originate locally or remotely:
+
+- local files : '/home/guido/src/local/data.txt'
+- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'
+
+DataSource files can also be compressed or uncompressed. Currently only
+gzip, bz2 and xz are supported.
+
+Example::
+
+ >>> # Create a DataSource, use os.curdir (default) for local storage.
+ >>> from numpy import DataSource
+ >>> ds = DataSource()
+ >>>
+ >>> # Open a remote file.
+ >>> # DataSource downloads the file, stores it locally in:
+ >>> # './www.google.com/index.html'
+    >>> # then opens the file and returns a file object.
+ >>> fp = ds.open('http://www.google.com/') # doctest: +SKIP
+ >>>
+ >>> # Use the file as you normally would
+ >>> fp.read() # doctest: +SKIP
+ >>> fp.close() # doctest: +SKIP
+
+"""
+import os
+import io
+
+from .._utils import set_module
+
+
+_open = open
+
+
+def _check_mode(mode, encoding, newline):
+ """Check mode and that encoding and newline are compatible.
+
+ Parameters
+ ----------
+ mode : str
+ File open mode.
+ encoding : str
+ File encoding.
+ newline : str
+ Newline for text files.
+
+ """
+ if "t" in mode:
+ if "b" in mode:
+ raise ValueError("Invalid mode: %r" % (mode,))
+ else:
+ if encoding is not None:
+ raise ValueError("Argument 'encoding' not supported in binary mode")
+ if newline is not None:
+ raise ValueError("Argument 'newline' not supported in binary mode")
+
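+# Illustrative sketch (added comment, not executed): _check_mode only allows
+# encoding/newline arguments for text modes, e.g.
+#
+#   >>> _check_mode('rt', encoding='utf-8', newline='')   # no error
+#   >>> _check_mode('rb', encoding='utf-8', newline=None)
+#   Traceback (most recent call last):
+#       ...
+#   ValueError: Argument 'encoding' not supported in binary mode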
+
+# Using a class instead of a module-level dictionary
+# to reduce the initial 'import numpy' overhead by
+# deferring the import of lzma, bz2 and gzip until needed
+
+# TODO: .zip support, .tar support?
+class _FileOpeners:
+ """
+ Container for different methods to open (un-)compressed files.
+
+ `_FileOpeners` contains a dictionary that holds one method for each
+    supported file format. Item lookup is implemented in such a way
+ that an instance of `_FileOpeners` itself can be indexed with the keys
+ of that dictionary. Currently uncompressed files as well as files
+ compressed with ``gzip``, ``bz2`` or ``xz`` compression are supported.
+
+ Notes
+ -----
+ `_file_openers`, an instance of `_FileOpeners`, is made available for
+ use in the `_datasource` module.
+
+ Examples
+ --------
+ >>> import gzip
+ >>> np.lib._datasource._file_openers.keys()
+ [None, '.bz2', '.gz', '.xz', '.lzma']
+ >>> np.lib._datasource._file_openers['.gz'] is gzip.open
+ True
+
+ """
+
+ def __init__(self):
+ self._loaded = False
+ self._file_openers = {None: io.open}
+
+ def _load(self):
+ if self._loaded:
+ return
+
+ try:
+ import bz2
+ self._file_openers[".bz2"] = bz2.open
+ except ImportError:
+ pass
+
+ try:
+ import gzip
+ self._file_openers[".gz"] = gzip.open
+ except ImportError:
+ pass
+
+ try:
+ import lzma
+ self._file_openers[".xz"] = lzma.open
+ self._file_openers[".lzma"] = lzma.open
+ except (ImportError, AttributeError):
+ # There are incompatible backports of lzma that do not have the
+ # lzma.open attribute, so catch that as well as ImportError.
+ pass
+
+ self._loaded = True
+
+ def keys(self):
+ """
+ Return the keys of currently supported file openers.
+
+ Parameters
+ ----------
+ None
+
+ Returns
+ -------
+ keys : list
+ The keys are None for uncompressed files and the file extension
+ strings (i.e. ``'.gz'``, ``'.xz'``) for supported compression
+ methods.
+
+ """
+ self._load()
+ return list(self._file_openers.keys())
+
+ def __getitem__(self, key):
+ self._load()
+ return self._file_openers[key]
+
+_file_openers = _FileOpeners()
+
+def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
+ """
+ Open `path` with `mode` and return the file object.
+
+    If ``path`` is a URL, it will be downloaded, stored in the
+ `DataSource` `destpath` directory and opened from there.
+
+ Parameters
+ ----------
+ path : str
+ Local file path or URL to open.
+ mode : str, optional
+ Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
+ append. Available modes depend on the type of object specified by
+ path. Default is 'r'.
+ destpath : str, optional
+ Path to the directory where the source file gets downloaded to for
+ use. If `destpath` is None, a temporary directory will be created.
+ The default path is the current directory.
+ encoding : {None, str}, optional
+ Open text file with given encoding. The default encoding will be
+ what `io.open` uses.
+ newline : {None, str}, optional
+ Newline to use when reading text file.
+
+ Returns
+ -------
+ out : file object
+ The opened file.
+
+ Notes
+ -----
+ This is a convenience function that instantiates a `DataSource` and
+ returns the file object from ``DataSource.open(path)``.
+
+ """
+
+ ds = DataSource(destpath)
+ return ds.open(path, mode, encoding=encoding, newline=newline)
+
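+# Usage sketch (added for illustration; the URL is hypothetical):
+#
+#   >>> from numpy.lib._datasource import open as ds_open
+#   >>> with ds_open('http://example.com/data.txt.gz') as fp:  # doctest: +SKIP
+#   ...     data = fp.read()
+#
+# The file is downloaded into the default ``destpath`` (the current
+# directory) and opened through the gzip opener selected by its extension.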
+
+@set_module('numpy')
+class DataSource:
+ """
+ DataSource(destpath='.')
+
+ A generic data source file (file, http, ftp, ...).
+
+ DataSources can be local files or remote files/URLs. The files may
+ also be compressed or uncompressed. DataSource hides some of the
+ low-level details of downloading the file, allowing you to simply pass
+ in a valid file path (or URL) and obtain a file object.
+
+ Parameters
+ ----------
+ destpath : str or None, optional
+ Path to the directory where the source file gets downloaded to for
+ use. If `destpath` is None, a temporary directory will be created.
+ The default path is the current directory.
+
+ Notes
+ -----
+    URLs require a scheme string (``http://``); without one they
+    will fail::
+
+ >>> repos = np.DataSource()
+ >>> repos.exists('www.google.com/index.html')
+ False
+ >>> repos.exists('http://www.google.com/index.html')
+ True
+
+ Temporary directories are deleted when the DataSource is deleted.
+
+ Examples
+ --------
+ ::
+
+ >>> ds = np.DataSource('/home/guido')
+ >>> urlname = 'http://www.google.com/'
+ >>> gfile = ds.open('http://www.google.com/')
+ >>> ds.abspath(urlname)
+ '/home/guido/www.google.com/index.html'
+
+ >>> ds = np.DataSource(None) # use with temporary file
+ >>> ds.open('/home/guido/foobar.txt')
+    <open file '/home/guido/foobar.txt', mode 'r' at 0x91d4430>
+ >>> ds.abspath('/home/guido/foobar.txt')
+ '/tmp/.../home/guido/foobar.txt'
+
+ """
+
+ def __init__(self, destpath=os.curdir):
+ """Create a DataSource with a local path at destpath."""
+ if destpath:
+ self._destpath = os.path.abspath(destpath)
+ self._istmpdest = False
+ else:
+ import tempfile # deferring import to improve startup time
+ self._destpath = tempfile.mkdtemp()
+ self._istmpdest = True
+
+ def __del__(self):
+ # Remove temp directories
+ if hasattr(self, '_istmpdest') and self._istmpdest:
+ import shutil
+
+ shutil.rmtree(self._destpath)
+
+ def _iszip(self, filename):
+ """Test if the filename is a zip file by looking at the file extension.
+
+ """
+ fname, ext = os.path.splitext(filename)
+ return ext in _file_openers.keys()
+
+ def _iswritemode(self, mode):
+ """Test if the given mode will open a file for writing."""
+
+ # Currently only used to test the bz2 files.
+ _writemodes = ("w", "+")
+ for c in mode:
+ if c in _writemodes:
+ return True
+ return False
+
+ def _splitzipext(self, filename):
+ """Split zip extension from filename and return filename.
+
+ Returns
+ -------
+ base, zip_ext : {tuple}
+
+ """
+
+ if self._iszip(filename):
+ return os.path.splitext(filename)
+ else:
+ return filename, None
+
+ def _possible_names(self, filename):
+ """Return a tuple containing compressed filename variations."""
+ names = [filename]
+ if not self._iszip(filename):
+ for zipext in _file_openers.keys():
+ if zipext:
+ names.append(filename+zipext)
+ return names
+
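+    # Illustration: with bz2, gzip and lzma all importable, the candidates
+    # generated for a hypothetical 'data.txt' are
+    #
+    #   >>> DataSource()._possible_names('data.txt')
+    #   ['data.txt', 'data.txt.bz2', 'data.txt.gz', 'data.txt.xz', 'data.txt.lzma']
+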
+ def _isurl(self, path):
+ """Test if path is a net location. Tests the scheme and netloc."""
+
+ # We do this here to reduce the 'import numpy' initial import time.
+ from urllib.parse import urlparse
+
+ # BUG : URLs require a scheme string ('http://') to be used.
+ # www.google.com will fail.
+ # Should we prepend the scheme for those that don't have it and
+        # test that also? Similar to the way we append .gz and test
+        # for compressed versions of files.
+
+ scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
+ return bool(scheme and netloc)
+
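+    # Behaviour sketch: only fully qualified URLs count as remote, e.g.
+    #
+    #   >>> ds = DataSource()
+    #   >>> ds._isurl('http://www.example.com/data.txt')
+    #   True
+    #   >>> ds._isurl('www.example.com/data.txt')   # no scheme -> treated as local
+    #   False
+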
+ def _cache(self, path):
+ """Cache the file specified by path.
+
+ Creates a copy of the file in the datasource cache.
+
+ """
+ # We import these here because importing them is slow and
+ # a significant fraction of numpy's total import time.
+ import shutil
+ from urllib.request import urlopen
+
+ upath = self.abspath(path)
+
+ # ensure directory exists
+ if not os.path.exists(os.path.dirname(upath)):
+ os.makedirs(os.path.dirname(upath))
+
+ # TODO: Doesn't handle compressed files!
+ if self._isurl(path):
+ with urlopen(path) as openedurl:
+ with _open(upath, 'wb') as f:
+ shutil.copyfileobj(openedurl, f)
+ else:
+ shutil.copyfile(path, upath)
+ return upath
+
+ def _findfile(self, path):
+ """Searches for ``path`` and returns full path if found.
+
+        If path is a URL, _findfile will cache a local copy and return the
+ path to the cached file. If path is a local file, _findfile will
+ return a path to that local file.
+
+ The search will include possible compressed versions of the file
+ and return the first occurrence found.
+
+ """
+
+ # Build list of possible local file paths
+ if not self._isurl(path):
+ # Valid local paths
+ filelist = self._possible_names(path)
+ # Paths in self._destpath
+ filelist += self._possible_names(self.abspath(path))
+ else:
+ # Cached URLs in self._destpath
+ filelist = self._possible_names(self.abspath(path))
+ # Remote URLs
+ filelist = filelist + self._possible_names(path)
+
+ for name in filelist:
+ if self.exists(name):
+ if self._isurl(name):
+ name = self._cache(name)
+ return name
+ return None
+
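+    # Search-order sketch (illustrative): if only './data.txt.gz' exists and
+    # destpath is the current directory, a request for the plain name still
+    # resolves to the compressed variant:
+    #
+    #   >>> DataSource()._findfile('data.txt')   # doctest: +SKIP
+    #   'data.txt.gz'
+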
+ def abspath(self, path):
+ """
+ Return absolute path of file in the DataSource directory.
+
+        If `path` is a URL, then `abspath` will return either the location
+        where the file exists locally or the location it would be stored at
+        when opened using the `open` method.
+
+ Parameters
+ ----------
+ path : str
+ Can be a local file or a remote URL.
+
+ Returns
+ -------
+ out : str
+ Complete path, including the `DataSource` destination directory.
+
+ Notes
+ -----
+ The functionality is based on `os.path.abspath`.
+
+ """
+ # We do this here to reduce the 'import numpy' initial import time.
+ from urllib.parse import urlparse
+
+ # TODO: This should be more robust. Handles case where path includes
+ # the destpath, but not other sub-paths. Failing case:
+ # path = /home/guido/datafile.txt
+ # destpath = /home/alex/
+ # upath = self.abspath(path)
+ # upath == '/home/alex/home/guido/datafile.txt'
+
+ # handle case where path includes self._destpath
+ splitpath = path.split(self._destpath, 2)
+ if len(splitpath) > 1:
+ path = splitpath[1]
+ scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
+ netloc = self._sanitize_relative_path(netloc)
+ upath = self._sanitize_relative_path(upath)
+ return os.path.join(self._destpath, netloc, upath)
+
+ def _sanitize_relative_path(self, path):
+ """Return a sanitised relative path for which
+ os.path.abspath(os.path.join(base, path)).startswith(base)
+ """
+ last = None
+ path = os.path.normpath(path)
+ while path != last:
+ last = path
+ # Note: os.path.join treats '/' as os.sep on Windows
+ path = path.lstrip(os.sep).lstrip('/')
+ path = path.lstrip(os.pardir).lstrip('..')
+ drive, path = os.path.splitdrive(path) # for Windows
+ return path
+
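+    # Sanitisation sketch (POSIX): leading separators and parent references
+    # are removed so the joined path stays below the destination directory:
+    #
+    #   >>> DataSource()._sanitize_relative_path('../../tmp/foo.txt')
+    #   'tmp/foo.txt'
+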
+ def exists(self, path):
+ """
+ Test if path exists.
+
+ Test if `path` exists as (and in this order):
+
+ - a local file.
+ - a remote URL that has been downloaded and stored locally in the
+ `DataSource` directory.
+ - a remote URL that has not been downloaded, but is valid and
+ accessible.
+
+ Parameters
+ ----------
+ path : str
+ Can be a local file or a remote URL.
+
+ Returns
+ -------
+ out : bool
+ True if `path` exists.
+
+ Notes
+ -----
+        When `path` is a URL, `exists` will return True if it is either
+        stored locally in the `DataSource` directory or is a valid remote
+        URL. `DataSource` does not discriminate between the two; the file
+        is accessible if it exists in either location.
+
+ """
+
+ # First test for local path
+ if os.path.exists(path):
+ return True
+
+ # We import this here because importing urllib is slow and
+ # a significant fraction of numpy's total import time.
+ from urllib.request import urlopen
+ from urllib.error import URLError
+
+ # Test cached url
+ upath = self.abspath(path)
+ if os.path.exists(upath):
+ return True
+
+ # Test remote url
+ if self._isurl(path):
+ try:
+ netfile = urlopen(path)
+ netfile.close()
+                del netfile
+ return True
+ except URLError:
+ return False
+ return False
+
+ def open(self, path, mode='r', encoding=None, newline=None):
+ """
+ Open and return file-like object.
+
+        If `path` is a URL, it will be downloaded, stored in the
+ `DataSource` directory and opened from there.
+
+ Parameters
+ ----------
+ path : str
+ Local file path or URL to open.
+ mode : {'r', 'w', 'a'}, optional
+ Mode to open `path`. Mode 'r' for reading, 'w' for writing,
+ 'a' to append. Available modes depend on the type of object
+ specified by `path`. Default is 'r'.
+ encoding : {None, str}, optional
+ Open text file with given encoding. The default encoding will be
+ what `io.open` uses.
+ newline : {None, str}, optional
+ Newline to use when reading text file.
+
+ Returns
+ -------
+ out : file object
+ File object.
+
+ """
+
+ # TODO: There is no support for opening a file for writing which
+ # doesn't exist yet (creating a file). Should there be?
+
+ # TODO: Add a ``subdir`` parameter for specifying the subdirectory
+ # used to store URLs in self._destpath.
+
+ if self._isurl(path) and self._iswritemode(mode):
+ raise ValueError("URLs are not writeable")
+
+ # NOTE: _findfile will fail on a new file opened for writing.
+ found = self._findfile(path)
+ if found:
+ _fname, ext = self._splitzipext(found)
+            if ext == '.bz2':
+                # bz2 does not accept '+' in its mode strings; drop it.
+                mode = mode.replace("+", "")
+ return _file_openers[ext](found, mode=mode,
+ encoding=encoding, newline=newline)
+ else:
+ raise FileNotFoundError(f"{path} not found.")
+
+
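+# Usage sketch (illustrative; assumes a gzip-compressed local file
+# './data.txt.gz'): asking for the uncompressed name opens the compressed
+# variant through gzip.open.
+#
+#   >>> ds = DataSource()
+#   >>> fp = ds.open('data.txt', 'rb')   # doctest: +SKIP
+#   >>> data = fp.read()                 # doctest: +SKIP
+#   >>> fp.close()                       # doctest: +SKIP
+
+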
+class Repository(DataSource):
+ """
+ Repository(baseurl, destpath='.')
+
+    A data repository where multiple DataSources share a base
+ URL/directory.
+
+ `Repository` extends `DataSource` by prepending a base URL (or
+ directory) to all the files it handles. Use `Repository` when you will
+ be working with multiple files from one base URL. Initialize
+ `Repository` with the base URL, then refer to each file by its filename
+ only.
+
+ Parameters
+ ----------
+ baseurl : str
+ Path to the local directory or remote location that contains the
+ data files.
+ destpath : str or None, optional
+ Path to the directory where the source file gets downloaded to for
+ use. If `destpath` is None, a temporary directory will be created.
+ The default path is the current directory.
+
+ Examples
+ --------
+ To analyze all files in the repository, do something like this
+ (note: this is not self-contained code)::
+
+ >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
+ >>> for filename in filelist:
+ ... fp = repos.open(filename)
+ ... fp.analyze()
+ ... fp.close()
+
+ Similarly you could use a URL for a repository::
+
+ >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')
+
+ """
+
+ def __init__(self, baseurl, destpath=os.curdir):
+ """Create a Repository with a shared url or directory of baseurl."""
+ DataSource.__init__(self, destpath=destpath)
+ self._baseurl = baseurl
+
+ def __del__(self):
+ DataSource.__del__(self)
+
+ def _fullpath(self, path):
+ """Return complete path for path. Prepends baseurl if necessary."""
+ splitpath = path.split(self._baseurl, 2)
+ if len(splitpath) == 1:
+ result = os.path.join(self._baseurl, path)
+ else:
+ result = path # path contains baseurl already
+ return result
+
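+    # Prefixing sketch (illustrative, POSIX path join; the base URL is
+    # hypothetical):
+    #
+    #   >>> repos = Repository('http://www.example.edu/data')
+    #   >>> repos._fullpath('file.txt')
+    #   'http://www.example.edu/data/file.txt'
+    #
+    # A path that already contains the base URL is returned unchanged.
+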
+ def _findfile(self, path):
+ """Extend DataSource method to prepend baseurl to ``path``."""
+ return DataSource._findfile(self, self._fullpath(path))
+
+ def abspath(self, path):
+ """
+ Return absolute path of file in the Repository directory.
+
+        If `path` is a URL, then `abspath` will return either the location
+        where the file exists locally or the location it would be stored at
+        when opened using the `open` method.
+
+ Parameters
+ ----------
+ path : str
+ Can be a local file or a remote URL. This may, but does not
+ have to, include the `baseurl` with which the `Repository` was
+ initialized.
+
+ Returns
+ -------
+ out : str
+ Complete path, including the `DataSource` destination directory.
+
+ """
+ return DataSource.abspath(self, self._fullpath(path))
+
+ def exists(self, path):
+ """
+ Test if path exists prepending Repository base URL to path.
+
+ Test if `path` exists as (and in this order):
+
+ - a local file.
+ - a remote URL that has been downloaded and stored locally in the
+ `DataSource` directory.
+ - a remote URL that has not been downloaded, but is valid and
+ accessible.
+
+ Parameters
+ ----------
+ path : str
+ Can be a local file or a remote URL. This may, but does not
+ have to, include the `baseurl` with which the `Repository` was
+ initialized.
+
+ Returns
+ -------
+ out : bool
+ True if `path` exists.
+
+ Notes
+ -----
+        When `path` is a URL, `exists` will return True if it is either
+        stored locally in the `DataSource` directory or is a valid remote
+        URL. `DataSource` does not discriminate between the two; the file
+        is accessible if it exists in either location.
+
+ """
+ return DataSource.exists(self, self._fullpath(path))
+
+ def open(self, path, mode='r', encoding=None, newline=None):
+ """
+ Open and return file-like object prepending Repository base URL.
+
+        If `path` is a URL, it will be downloaded, stored in the
+ DataSource directory and opened from there.
+
+ Parameters
+ ----------
+ path : str
+ Local file path or URL to open. This may, but does not have to,
+ include the `baseurl` with which the `Repository` was
+ initialized.
+ mode : {'r', 'w', 'a'}, optional
+ Mode to open `path`. Mode 'r' for reading, 'w' for writing,
+ 'a' to append. Available modes depend on the type of object
+ specified by `path`. Default is 'r'.
+ encoding : {None, str}, optional
+ Open text file with given encoding. The default encoding will be
+ what `io.open` uses.
+ newline : {None, str}, optional
+ Newline to use when reading text file.
+
+ Returns
+ -------
+ out : file object
+ File object.
+
+ """
+ return DataSource.open(self, self._fullpath(path), mode,
+ encoding=encoding, newline=newline)
+
+ def listdir(self):
+ """
+ List files in the source Repository.
+
+ Returns
+ -------
+ files : list of str
+ List of file names (not containing a directory part).
+
+ Notes
+ -----
+ Does not currently work for remote repositories.
+
+ """
+ if self._isurl(self._baseurl):
+ raise NotImplementedError(
+ "Directory listing of URLs, not supported yet.")
+ else:
+ return os.listdir(self._baseurl)
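+
+
+# End-to-end sketch for Repository (illustrative; the directory and its
+# contents are hypothetical):
+#
+#   >>> repos = Repository('/home/user/data/dir/')     # doctest: +SKIP
+#   >>> for filename in repos.listdir():               # doctest: +SKIP
+#   ...     with repos.open(filename) as fp:
+#   ...         header = fp.readline()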