Source code for openghg.util._download

from pathlib import Path
from rich.progress import wrap_file
from typing import Optional, Union
import logging

logger = logging.getLogger("openghg.util")
logger.setLevel(logging.DEBUG)  # Have to set level for logger as well as handler


[docs] def parse_url_filename(url: str) -> str: """Get the filename from a (messy) URL. Args: url: URL of file Returns: str: Filename """ from urllib.parse import urlparse return Path(urlparse(url).path).name
[docs] def download_data( url: str, filepath: Optional[Union[str, Path]] = None, timeout: int = 10 ) -> Optional[bytes]: """Download data file, with progress bar. Based on https://stackoverflow.com/a/63831344/1303032 Args: url: URL of content to download filepath: Filepath to write out data timeount: Timeout for HTTP request (seconds) Returns: bytes / None: Bytes if no filepath given """ import io import shutil from urllib.parse import urlparse import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry # type: ignore retriable_status_codes = [ requests.codes.internal_server_error, requests.codes.bad_gateway, requests.codes.service_unavailable, requests.codes.gateway_timeout, requests.codes.too_many_requests, requests.codes.request_timeout, ] retry_strategy = Retry( total=3, status_forcelist=retriable_status_codes, allowed_methods=["HEAD", "GET", "OPTIONS"], backoff_factor=1, ) # type: ignore adapter = HTTPAdapter(max_retries=retry_strategy) http = requests.Session() http.mount("https://", adapter) http.mount("http://", adapter) try: r = http.get(url=url, stream=True, allow_redirects=True, timeout=timeout) except ( requests.exceptions.RequestException, requests.exceptions.ConnectionError, ) as e: logger.info(f"Unable to retrieve data from {url}, error: {str(e)}") return None filename = Path(urlparse(url).path).name if r.status_code != 200: logger.info(f"Unable to download {url}, please check URL.") return None file_size = int(r.headers.get("Content-Length", 0)) desc = f"Downloading {filename}" r.raw.decode_content = True # mypy error ignored # rich and requests libraries not quite aligning but urllib3.response.HTTPResponse should be very similiar to BinaryIO object expected. with wrap_file(r.raw, total=file_size, description=desc) as r_raw: # type:ignore with io.BytesIO() as buf: shutil.copyfileobj(r_raw, buf) if filepath is None: return buf.getvalue() else: Path(filepath).write_bytes(buf.getvalue()) return None