Source code for cmemsapi.cmemsapi

#! /usr/bin/env python3
# -*- coding: utf-8 -*-

"""Main module."""

import calendar
import datetime as dt
import getpass as password
import hashlib
import logging
import math
import os
import re
import shutil
import subprocess
import sys
import time
from functools import reduce
from importlib import reload
from pathlib import Path

import requests as rq
import fire
import lxml.html
import pandas as pd
import xarray as xr
from funcy import omit

# Directory the program was started from; default root for data and log folders.
DEFAULT_CURRENT_PATH = os.getcwd()
# ANSI escape sequences used to emphasise parts of the console messages.
BOLD = '\033[1m'
END = '\033[0m'
# One log file per run, stamped to the minute, stored under ``./log/``.
LOGFILE = Path(
    DEFAULT_CURRENT_PATH, 'log',
    ''.join(["CMEMS_API_", dt.datetime.now().strftime('%Y%m%d_%H%M'), ".log"]))

try:
    if not LOGFILE.parent.exists():
        LOGFILE.parent.mkdir(parents=True)
    # Start from an empty file when a run is repeated within the same minute.
    if os.path.exists(LOGFILE):
        os.remove(LOGFILE)
    print(f'[INFO] Logging to: {str(LOGFILE)}')
    # ``basicConfig`` does nothing once the root logger has handlers, so the
    # module is reloaded first to make sure this configuration is applied.
    reload(logging)
    logging.basicConfig(filename=LOGFILE,
                        level=logging.DEBUG,
                        format='[%(asctime)s] - [%(levelname)s] - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
except IOError:
    print("[ERROR] Failed to set logger.")
def set_target_directory(local_storage_directory=None):
    """
    Return the directory in which downloaded data is stored, creating it
    when it does not exist yet.

    Parameters
    ----------
    local_storage_directory : path or str, optional
        Directory to use. When omitted (``None``) or empty, the directory
        ``copernicus-tmp-data`` under the start-up working directory is used.

    Returns
    -------
    target_directory : path
        Path to the (now existing) storage directory.

    """
    if not local_storage_directory:
        destination = Path(DEFAULT_CURRENT_PATH, 'copernicus-tmp-data')
    else:
        destination = Path(local_storage_directory)

    # Nothing to do (and nothing to report) when the directory is already there.
    if destination.exists():
        return destination

    destination.mkdir(parents=True)
    print(f'[INFO] Directory successfully created : {destination}.')
    return destination
def multireplace(tobereplaced, substitute):
    """
    Return ``tobereplaced`` with every key of ``substitute`` replaced by its value.

    All replacements are done in a single pass, so the output of one
    replacement is never re-scanned by another.

    Parameters
    ----------
    tobereplaced : str
        String to execute replacements on.
    substitute : dict
        Substitute dictionary {value to find: value to replace}.

    Returns
    -------
    str
        Replaced string. Returned unchanged when ``substitute`` is empty.

    """
    # An empty mapping would compile to an empty pattern that matches
    # everywhere and then fail on the lookup of ''.
    if not substitute:
        return tobereplaced
    # Longest keys first so that a key which is a prefix of another one
    # (e.g. 'ab' vs 'a') does not shadow the longer match.
    substrings = sorted(substitute, key=len, reverse=True)
    regex = re.compile('|'.join(map(re.escape, substrings)))
    return regex.sub(lambda match: substitute[match.group(0)], tobereplaced)
def query(question, default="yes"):
    """
    Ask a yes/no question on the console and return the answer as a boolean.

    The question is repeated until the user types one of the accepted
    answers (``yes``, ``ye``, ``y``, ``no``, ``n``; case-insensitive).

    Parameters
    ----------
    question : str
        Question displayed to the user.
    default : str, optional
        Answer assumed when the user just hits `Enter`: ``"yes"``, ``"no"``
        or ``None`` (no default, an explicit answer is required).
        The default is "yes".

    Raises
    ------
    ValueError
        If ``default`` is not one of ``"yes"``, ``"no"`` or ``None``.

    Returns
    -------
    bool
        ``True`` if the user validates the question, ``False`` otherwise.

    """
    answers = {"yes": True, "y": True, "ye": True, "no": False, "n": False}

    # The capital letter in the prompt shows which answer `Enter` selects.
    if default is None:
        prompt = " [y/n] "
    elif default in ("yes", "no"):
        prompt = " [Y/n] " if default == "yes" else " [y/N] "
    else:
        raise ValueError(f"[ERROR] Invalid default answer: '{default}'")

    while True:
        sys.stdout.write(question + prompt)
        reply = input().lower()
        if reply == '' and default is not None:
            return answers[default]
        if reply in answers:
            return answers[reply]
        sys.stdout.write("[ACTION] Please respond with 'yes' or 'no' "
                         "(or 'y' or 'n').\n")
def get_config_constraints():
    """
    Return the configuration used to decide how data requests are stacked.

    Returns
    -------
    split_dict : dict
        One entry per temporal resolution (``hourly_r``, ``day_r``,
        ``month_r``). Each entry holds a ``pattern`` list of substrings
        looked up in the product identifier, plus ``<stack>_s`` entries
        (``year_s``, ``month_s``, ``day_s``) giving the ``depth`` and ``geo``
        limits associated with that stack size.

    """
    # Limits shared by every temporal resolution.
    yearly_limits = {'depth': 6000, 'geo': 200}
    monthly_limits = {'depth': 6000, 'geo': 360}
    daily_limits = {'depth': 6000, 'geo': 360}

    hourly_patterns = [
        '-hi', 'hourly', 'hts', 'fc-h', '1-027', '1-032', 'rean-h', '1hr',
        '3dinst', '_hm', 'BLENDED', '15min', 'MetO-NWS-WAV-RAN', 'skin',
        'surface'
    ]
    daily_patterns = ['daily', 'weekly', 'an-fc-d', 'rean-d', 'day-', '-dm-']
    monthly_patterns = [
        'month', 'an-fc-m', 'rean-m', '-mm-', '-MON-',
        'ran-arc-myoceanv2-be', 'CORIOLIS', 'bgc3d'
    ]

    split_dict = {}
    split_dict['hourly_r'] = {
        'pattern': hourly_patterns,
        'year_s': yearly_limits,
        'month_s': monthly_limits,
        'day_s': daily_limits
    }
    split_dict['day_r'] = {
        'pattern': daily_patterns,
        'year_s': yearly_limits,
        'month_s': monthly_limits,
        'day_s': daily_limits
    }
    # Monthly products are never requested day by day.
    split_dict['month_r'] = {
        'pattern': monthly_patterns,
        'year_s': yearly_limits,
        'month_s': monthly_limits
    }
    return split_dict
def get_credentials(file_rc=None, sep='='):
    """
    Return Copernicus Marine credentials, verified against the login service.

    Credentials are read from a file or, if that file does not exist,
    typed in by the user (who can then choose to save them to that file).

    Parameters
    ----------
    file_rc : str or path, optional
        Location of the file storing credentials. The default is None, in
        which case ``copernicus_credentials.txt`` in the current working
        directory is used. The first non-blank line must hold the username
        and the second one the password, each as ``<name><sep><value>``.
    sep : str, optional
        Character used to separate credential and its value.
        The default is `=`.

    Raises
    ------
    SystemExit
        If the file content cannot be parsed or if the credentials are
        rejected by ``check_credentials``.

    Returns
    -------
    copernicus_username : str
        Copernicus Marine username.
    copernicus_password : str
        Copernicus Marine password.

    """
    # Always work with a Path: ``.exists()`` is needed further down, also
    # when the caller passed a plain string.
    if file_rc:
        file_rc = Path(file_rc)
    else:
        file_rc = Path.cwd() / 'copernicus_credentials.txt'

    lines = []
    try:
        with open(file_rc, 'r') as cred:
            # Blank lines (e.g. a trailing newline) carry no credential.
            lines = [line for line in cred if line.strip()]
    except FileNotFoundError:
        print(f'[INFO] Credentials must be entered hereafter, obtained from: '
              f'')
        print(
            f'[INFO] If you have forgotten either your USERNAME '
            f'(which {BOLD}is NOT your email address{END}) or your PASSWORD, '
            f'please visit:'
        )
        time.sleep(2)
        usr = password.getpass(
            prompt=f"[ACTION] Please input your Copernicus {BOLD}USERNAME{END}"
            " (and hit `Enter` key):")
        time.sleep(2)
        pwd = password.getpass(
            prompt=f"[ACTION] Please input your Copernicus {BOLD}PASSWORD{END}"
            " (and hit `Enter` key):")
        lines.append(f'username{sep}{usr}')
        lines.append(f'password{sep}{pwd}')
        create_cred_file = query(
            f'[ACTION] For future usage, do you want to save credentials in a'
            ' configuration file?', 'yes')
        if create_cred_file:
            # NOTE: the password is written to disk in clear text.
            with open(file_rc, 'w') as cred:
                for line in lines:
                    cred.write(''.join([line, '\n']))

    # Two entries are required (username, then password), each with `sep`.
    if len(lines) < 2 or not all(sep in item for item in lines):
        print('[ERROR] Sperator is not found. Must be specifed or corrected.\n'
              f'[WARNING] Please double check content of {file_rc}. '
              f'It should match (please mind the `{sep}`):'
              f'\nusername{sep}<USERNAME>\npassword{sep}<PASSWORD>')
        raise SystemExit

    # Split on the FIRST separator only, so that a value which itself
    # contains `sep` (e.g. a password with '=') is kept intact.
    copernicus_username = lines[0].strip().partition(sep)[2]
    copernicus_password = lines[1].strip().partition(sep)[2]

    if not check_credentials(copernicus_username, copernicus_password):
        if file_rc.exists():
            msg = f' from content of {file_rc}'
        else:
            msg = ''
        print(
            '[ERROR] Provided username and/or password could not be validated.\n'
            f'[WARNING] Please double check it{msg}. More help at: '
            '')
        raise SystemExit
    print('[INFO] Credentials have been succcessfully loaded and verified.')
    return copernicus_username, copernicus_password
def check_credentials(user, pwd):
    """
    Check that the provided Copernicus Marine credentials are accepted.

    The login form is fetched first so that its hidden fields can be sent
    back together with the credentials; the login is considered successful
    when the session then holds a ``CASTGC`` cookie.

    Parameters
    ----------
    user : str
        Copernicus Marine username.
    pwd : str
        Copernicus Marine password.

    Returns
    -------
    bool
        ``True`` if credentials are correct, ``False`` otherwise.

    """
    # NOTE(review): the URL was blank in the reviewed source; it is set here
    # to the Copernicus Marine CAS login endpoint - please confirm before use,
    # since the credentials are posted to it.
    cmems_cas_url = 'https://cmems-cas.cls.fr/cas/login'
    conn_session = rq.session()
    login_session = conn_session.get(cmems_cas_url)
    login_from_html = lxml.html.fromstring(login_session.text)
    hidden_elements_from_html = login_from_html.xpath(
        '//form//input[@type="hidden"]')
    # Hidden form inputs must be echoed back with the credentials.
    playload = {
        he.attrib['name']: he.attrib['value']
        for he in hidden_elements_from_html
    }
    playload['username'] = user
    playload['password'] = pwd
    conn_session.post(cmems_cas_url, data=playload)
    return 'CASTGC' in conn_session.cookies
def get_viewscript():
    """
    Ask the user to paste the ``VIEW_SCRIPT`` command until it looks valid.

    The pasted text is accepted once it contains both the motuclient
    invocation and the untouched output/credential placeholders.

    Returns
    -------
    view_myscript : str
        String representing the ``TEMPLATE COMMAND`` generated by the
        webportal.

    """
    required_fragments = (
        'python -m motuclient --motu http',
        '--out-dir <OUTPUT_DIRECTORY> --out-name <OUTPUT_FILENAME> '
        '--user <USERNAME> --pwd <PASSWORD>',
    )
    while True:
        pasted = input(
            "[ACTION] Please paste the template command displayed on the webportal:\n"
        )
        if all(fragment in pasted for fragment in required_fragments):
            return pasted
        print(
            '[ERROR] Cannot parse VIEWSCRIPT. '
            'Please paste the ``TEMPLATE COMMAND`` as shown in this article: '
            ''
            'how-to-write-and-run-the-script-to-download-'
            'cmems-products-through-subset-or-direct-download-mechanisms/?idpage=169'
        )
def viewscript_string_to_dict(viewmy_script):
    """
    Convert the ``VIEW SCRIPT`` string displayed by the webportal to a ``dict``.

    Parameters
    ----------
    viewmy_script : str
        Command line made of ``--option value`` chunks. Date values are
        expected to be wrapped in double quotes, either ``"YYYY-MM-DD"`` or
        ``"YYYY-MM-DD HH:MM:SS"``.

    Raises
    ------
    ValueError
        If the ``date-min`` value is too short to be one of the two
        supported date formats.

    Returns
    -------
    vs_dict : dict
        One entry per option (values kept as strings, ``variable`` as a list
        of all requested variables) plus derived entries: ``abs_geo``,
        ``abs_depth`` (only when depth bounds are given), ``dt-date-min``,
        ``dt-date-max``, ``delta-days``, ``prefix``, ``suffix``,
        ``gridpoint`` (only for a single grid point) and ``out_var_name``.

    """
    # Each "--key value" chunk becomes a (key, value) pair; the leading
    # "python -m motuclient" chunk becomes ('python', '-m motuclient').
    pairs = [
        chunk.strip().partition(" ")[::2]
        for chunk in viewmy_script.split('--')
    ]
    vs_dict = dict(pairs)
    # `dict` keeps only the last `--variable`; collect all of them here.
    vs_dict['variable'] = [value for key, value in pairs if key == 'variable']

    vs_dict['abs_geo'] = [
        abs(float(vs_dict['longitude-min']) - float(vs_dict['longitude-max'])),
        abs(float(vs_dict['latitude-min']) - float(vs_dict['latitude-max']))
    ]
    if 'depth-min' in vs_dict and 'depth-max' in vs_dict:
        vs_dict['abs_depth'] = abs(
            float(vs_dict['depth-min']) - float(vs_dict['depth-max']))
    else:
        print(f"[INFO] The {vs_dict['product-id']} is 3D and not 4D:"
              " it does not contain depth dimension.")

    # Lengths include the two surrounding double quotes.
    date_length = len(vs_dict['date-min'])
    if date_length == 12:
        dtformat = '%Y-%m-%d'
    elif date_length > 12:
        dtformat = '%Y-%m-%d %H:%M:%S'
    else:
        raise ValueError(
            f"[ERROR] Cannot parse date-min value: {vs_dict['date-min']}")
    vs_dict['dt-date-min'] = dt.datetime.strptime(vs_dict['date-min'][1:-1],
                                                  dtformat)
    vs_dict['dt-date-max'] = dt.datetime.strptime(vs_dict['date-max'][1:-1],
                                                  dtformat)
    # An end date falling on the first of a month is pushed by one day.
    if vs_dict['dt-date-max'].day == 1:
        vs_dict['dt-date-max'] = vs_dict['dt-date-max'] + dt.timedelta(days=1)
    vs_dict['delta-days'] = vs_dict['dt-date-max'] - vs_dict['dt-date-min']

    # Prefix = first and last two '_'-separated tokens of the service
    # identifier (part before the first '-').
    service_tokens = vs_dict['service-id'].split('-')[0].split('_')
    vs_dict['prefix'] = '_'.join(service_tokens[i] for i in [0, -2, -1])
    vs_dict['suffix'] = '.nc'

    # Single grid point: encode its position in a filename-friendly way.
    if vs_dict['abs_geo'][0] == 0 and vs_dict['abs_geo'][1] == 0:
        longitude = vs_dict['longitude-min']
        latitude = vs_dict['latitude-min']
        if '-' in longitude:
            lon_tag = longitude.replace(".", "dot").replace("-", "W")
        else:
            lon_tag = ''.join(['E', longitude.replace('.', 'dot')])
        if '-' in latitude:
            lat_tag = latitude.replace(".", "dot").replace("-", "S")
        else:
            lat_tag = ''.join(['N', latitude.replace('.', 'dot')])
        vs_dict['gridpoint'] = '_'.join(['gridpoint', lon_tag, lat_tag])

    if len(vs_dict['variable']) > 6:
        vs_dict['out_var_name'] = 'several_vars'
    else:
        vs_dict['out_var_name'] = '_'.join(vs_dict['variable'])
    return vs_dict
def get_dates_stack(vs_dict, check_stack, size=None, renew=None):
    """
    Update a ``dict`` containing ``VIEW SCRIPT`` values with dates for
    sub-requests.

    Parameters
    ----------
    vs_dict : dict
        Request description; must hold ``dt-date-min`` and ``dt-date-max``
        as ``datetime`` objects.
    check_stack : str
        Size of the sub-request: ``'day'``, ``'month'`` or ``'year'``.
        Any other value only prints a message.
    size : bool, optional
        When truthy the bounds are stored under ``size-date-min`` /
        ``size-date-max``, otherwise under ``cmd-date-min`` /
        ``cmd-date-max``. The default is None.
    renew : datetime, optional
        Start of the sub-request. The default is None, meaning
        ``vs_dict['dt-date-min']``.

    Returns
    -------
    vs_dict : dict
        The same dictionary, updated in place with the sub-request bounds
        and the ``format`` used to stamp output filenames.

    """
    cmd = 'size' if size else 'cmd'
    date_in = renew if renew else vs_dict['dt-date-min']

    if check_stack == 'day':
        vs_dict[f'{cmd}-date-min'] = dt.datetime(date_in.year, date_in.month,
                                                 date_in.day, 0)
        vs_dict[f'{cmd}-date-max'] = dt.datetime(date_in.year, date_in.month,
                                                 date_in.day, 23, 30)
        vs_dict['format'] = "%Y%m%d"
    elif check_stack == 'month':
        last_day = calendar.monthrange(date_in.year, date_in.month)[1]
        vs_dict[f'{cmd}-date-min'] = dt.datetime(date_in.year, date_in.month,
                                                 1, 0)
        vs_dict[f'{cmd}-date-max'] = dt.datetime(date_in.year, date_in.month,
                                                 last_day, 23, 30)
        vs_dict['format'] = "%Y%m"
    elif check_stack == 'year':
        final_date = vs_dict['dt-date-max']
        if date_in.year == final_date.year:
            # Last year of the request: stop at the end of the final month.
            last_day = calendar.monthrange(date_in.year, final_date.month)[1]
            vs_dict[f'{cmd}-date-max'] = dt.datetime(
                date_in.year, final_date.month, last_day, 23, 30)
        else:
            vs_dict[f'{cmd}-date-max'] = dt.datetime(date_in.year, 12, 31, 23,
                                                     30)
        vs_dict[f'{cmd}-date-min'] = dt.datetime(date_in.year, date_in.month,
                                                 date_in.day, 0)
        vs_dict['format'] = "%Y"
    else:
        print(f'No matching stack queries found for: {check_stack}')
    return vs_dict
def viewscript_dict_to_string(size=None, strict=None, cmd=None, **kwargs):
    """
    Convert the ``dict`` containing keys and values of the ``VIEW SCRIPT``,
    into a string as displayed by the webportal.

    Exactly which date bounds are written depends on the selected mode:
    ``size`` uses ``size-date-min/max``, ``strict`` uses ``dt-date-min/max``
    and ``cmd`` uses ``cmd-date-min/max``. When several modes are set the
    first one in that order wins.

    Parameters
    ----------
    size : bool, optional
        Select the ``size-`` date bounds. The default is None.
    strict : bool, optional
        Select the ``dt-`` date bounds. The default is None.
    cmd : bool, optional
        Select the ``cmd-`` date bounds. The default is None.
    **kwargs : dict
        Request description; only known keys are used and missing keys are
        skipped. ``variable`` may be a single value or a list.

    Raises
    ------
    ValueError
        If none of ``size``, ``strict`` or ``cmd`` is set.

    Returns
    -------
    command : str
        Command line, options separated by single spaces.

    """
    if size:
        feature = 'size'
    elif strict:
        feature = 'dt'
    elif cmd:
        feature = 'cmd'
    else:
        # Without a mode there is no way to know which date bounds to use.
        raise ValueError(
            "[ERROR] One of `size`, `strict` or `cmd` must be set.")

    vs_string = []
    if 'python' in kwargs:
        vs_string.append(f"python {kwargs['python']}")

    # Options whose dictionary key is also the command-line option name.
    for option in ('motu', 'service-id', 'product-id', 'longitude-min',
                   'longitude-max', 'latitude-min', 'latitude-max'):
        if option in kwargs:
            vs_string.append(f"--{option} {kwargs[option]}")

    # Dates contain a space, hence the surrounding double quotes.
    for bound in ('date-min', 'date-max'):
        key = f'{feature}-{bound}'
        if key in kwargs:
            vs_string.append(f"--{bound} \"{kwargs[key]}\"")

    for option in ('depth-min', 'depth-max'):
        if option in kwargs:
            vs_string.append(f"--{option} {kwargs[option]}")

    if 'variable' in kwargs:
        variables = kwargs['variable']
        if not isinstance(variables, list):
            variables = [variables]
        for var in variables:
            vs_string.append(f"--variable {var}")

    if 'outname' in kwargs:
        vs_string.append(f"--out-name {kwargs['outname']}")
    if 'target_directory' in kwargs:
        vs_string.append(f"--out-dir {kwargs['target_directory']}")

    command = ' '.join(vs_string)
    return command
def get_data(command=None, user=None, pwd=None, size=None):
    """
    Submit a data request and return its exit status and console output.

    If ``size`` is specified, the request only asks the server for the
    potential result file size (whose units is `kBytes`) instead of
    downloading the binary netCDF file.

    Parameters
    ----------
    command : str, optional
        Command line to run, without output and credential options.
        The default is None, in which case the user is asked to paste it.
    user : str, optional
        Copernicus Marine username. The default is None.
    pwd : str, optional
        Copernicus Marine password. The default is None. When both ``user``
        and ``pwd`` are missing, credentials are loaded with
        ``get_credentials``.
    size : bool, optional
        Ask for the size of the result only. The default is None.

    Returns
    -------
    returncode : int
        Exit status of the command.
    message : bytes
        Standard output of the command.

    """
    if not user and not pwd:
        user, pwd = get_credentials()
    if not command:
        view_myscript = get_viewscript()
        command = view_myscript.replace(
            '--out-dir <OUTPUT_DIRECTORY> --out-name <OUTPUT_FILENAME> '
            '--user <USERNAME> --pwd <PASSWORD>', '')
    msg = '--size -o console' if size else ''

    get_command = ' '.join([command, msg, '-q -u ', user, ' -p ', pwd])
    # Build the logged command with a placeholder rather than masking the
    # real one afterwards: masking the last space-separated token leaks the
    # beginning of any password that contains a space.
    cmd_rep = ' '.join([command, msg, '-q -u ', user, ' -p ', '****'])
    logging.info("SUBMIT REQUEST: %s", cmd_rep)

    # NOTE(review): the command runs through the shell with the credentials
    # interpolated unquoted; a username/password containing shell
    # metacharacters can break or alter the command. Left as is because the
    # pasted template relies on shell quoting of the date arguments.
    process = subprocess.Popen(get_command,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               shell=True)
    message, _ = process.communicate()
    returncode = process.returncode
    return returncode, message
def _requested_size_in_bytes(message):
    """Return the request size found in a MOTU size reply, in bytes."""
    text = str(message)
    # Handling exceptions due to changes in MOTU API from v3.10 to v3.12:
    # the position of the size attribute differs between both versions.
    try:
        size_value = float(text.split('=')[-1].split('"')[1])
    except ValueError:
        size_value = float(text.split('=')[4].split('"')[1])
    return size_value * 1000


def check_data(returncode,
               message,
               command=None,
               user=None,
               stack=None,
               size=None):
    """
    Returns ``True`` if status of the submitted request is successful,
    ``False`` otherwise.

    Parameters
    ----------
    returncode : int
        Exit status returned by ``get_data``.
    message : bytes
        Console output returned by ``get_data``.
    command : str, optional
        Command that was submitted; only used, together with ``user``, to
        build the authorization token of an oversized request.
        The default is None.
    user : str, optional
        Copernicus Marine username. The default is None.
    stack : str, optional
        Stack size being probed. When set, a size request is only valid if
        the server answers with code ``005-0``. The default is None.
    size : bool, optional
        Whether ``message`` is the reply to a size request.
        The default is None.

    Raises
    ------
    SystemExit
        If the failed request reports an ``HTTP Error 4xx``.

    Returns
    -------
    valid_check : bool
        Whether the request can be considered successful.

    """
    valid_check = False

    if returncode != 0:
        logging.error("FAILED REQUEST - raised error:\n %s", message)
        print('[WARNING] Failed data request has been logged.\n')
        if b'HTTP Error 503' in message:
            print(
                'HTTP Error 503 - Service is temporary down. Break for 5 minutes.'
            )
            time.sleep(300)
        if b'HTTP Error 4' in message:
            logging.error('Permanent error. Exiting program.')
            raise SystemExit
        return valid_check

    if b'[ERROR]' in message:
        logging.error("FAILED REQUEST - raised error:\n %s", message)
        return valid_check

    if not size:
        # Reply to an actual download.
        logging.info('Request status is successful')
        print(
            '[INFO] Server is releasing the token to successfully grant next request. '
            'It will resume AUTOMATICALLY.\n')
        time.sleep(5)
        return True

    size_accepted = b'code="005-0"' in message
    if stack:
        # Probing a stack size: only an accepted size is good enough.
        return size_accepted
    if size_accepted:
        return True
    if b'code="005-7"' not in message:
        return valid_check

    # Code 005-7 on the initial request: too big for one request, which is
    # fine (it will be split) as long as the total stays under the limit.
    # Sizes are compared as plain numbers; comparing the human readable
    # (value, unit) pairs would rank e.g. 500 MiB above 93 GiB.
    threshold_in_bytes = 1.0E8 * 1000
    if _requested_size_in_bytes(message) <= threshold_in_bytes:
        return True

    # Above the limit: valid only if the request has been authorized.
    token = hashlib.md5(
        (':'.join([command.rstrip(), user])).encode('utf-8')).hexdigest()
    # NOTE(review): the token URL is blank in the reviewed source - to be
    # restored before this branch can work.
    token_url = ''  # pylint: disable=line-too-long
    resp = rq.get(f'{token_url}/{token}')
    if resp.status_code == 200:
        valid_check = True
    else:
        msg = (
            '[ERROR] Your datarequest exceeds max limit set to 100 GiB.\n'
            '[ACTION] Please contact Support Team at:\n'
            ' \n'  # pylint: disable=line-too-long
            f'[ACTION] And submit a query attaching your logile located here:\n'
            f' {LOGFILE}.\n'
            '[INFO] Once it is done and by the next 48 hours, '
            'the Support Team will authorize your request '
            'and send an email to the inbox linked to '
            f'the Copernicus Marine Account (username = {user}) '
            'for confirmation and instructions.'
        )
        print(msg)
        logging.error(msg)
    return valid_check
def process_viewscript(target_directory,
                       view_myscript=None,
                       user=None,
                       pwd=None,
                       forcestack=None):
    """
    Generates as many data requests as required to match initial
    ``VIEW_SCRIPT``.

    The request is first sized as a whole, then split into daily, monthly
    or yearly sub-requests which are downloaded one after the other.
    Sub-requests whose output file already exists are skipped, and the
    whole series is attempted up to three times.

    Parameters
    ----------
    target_directory : str or path
        Directory in which the netCDF files are written.
    view_myscript : str, optional
        ``TEMPLATE COMMAND`` generated by the webportal. The default is
        None, in which case the user is asked to paste it.
    user : str, optional
        Copernicus Marine username. The default is None.
    pwd : str, optional
        Copernicus Marine password. The default is None. When both ``user``
        and ``pwd`` are missing, credentials are loaded with
        ``get_credentials``.
    forcestack : str, optional
        Stack size (``'day'``, ``'month'`` or ``'year'``) to use instead of
        the automatic selection. The default is None.

    Raises
    ------
    ValueError
        If ``view_myscript`` is given but is not a valid template command.

    Returns
    -------
    str or bool
        On success, returns the name of the last output file matching the
        ``VIEW_SCRIPT`` data request, ``False`` otherwise.

    """
    split_dict = get_config_constraints()
    outname = False
    # `target_directory / outname` below needs a Path, also for str input.
    target_directory = Path(target_directory)

    if not user and not pwd:
        user, pwd = get_credentials()
    if not view_myscript:
        view_myscript = get_viewscript()
    else:
        uni_test = [
            'python -m motuclient --motu http', ' '.join([
                '--out-dir <OUTPUT_DIRECTORY> --out-name <OUTPUT_FILENAME>',
                '--user <USERNAME> --pwd <PASSWORD>'
            ])
        ]
        if not all([item in view_myscript for item in uni_test]):
            msg = (
                '[DEBUG] Cannot parse VIEWSCRIPT. '
                'Please paste the ``TEMPLATE COMMAND`` as shown in this article: '
                ''
                'how-to-write-and-run-the-script-to-download-'
                'cmems-products-through-subset-or-direct-download-mechanisms/?idpage=169'
            )
            raise ValueError(msg)

    view_script_command = view_myscript.replace(
        '--out-dir <OUTPUT_DIRECTORY> --out-name <OUTPUT_FILENAME> '
        '--user <USERNAME> --pwd <PASSWORD>', '')

    # Size the request as a whole before anything else.
    init_returncode, init_message = get_data(view_script_command,
                                             user,
                                             pwd,
                                             size=True)
    if not check_data(
            init_returncode, init_message, view_script_command, user,
            size=True):
        return outname

    vs_dict = viewscript_string_to_dict(view_script_command)
    vs_dict['target_directory'] = str(target_directory)

    # Pick the stack size: for each resolution whose pattern matches the
    # product, try its stack sizes in turn and keep the first one the
    # server accepts.
    stack = None
    if not forcestack:
        for key_r, val_r in split_dict.items():
            if any(x in vs_dict['product-id']
                   for x in val_r.get('pattern', 'Not Found')):
                for key_s, val_s in omit(split_dict[key_r].items(),
                                         'pattern'):
                    geo_ok = [
                        g <= val_s.get('geo') for g in vs_dict['abs_geo']
                    ]
                    try:
                        check = all([
                            val_s.get('depth') >= vs_dict['abs_depth'],
                            *geo_ok
                        ])
                    except KeyError:
                        # No depth dimension in the request.
                        check = all(geo_ok)
                    if check:
                        # 'year_s' -> 'year', 'month_s' -> 'month', ...
                        check_stack = key_s[:-2]
                        if vs_dict['delta-days'].days < 28:
                            check_stack = 'day'
                        vs_dict = get_dates_stack(vs_dict,
                                                  check_stack,
                                                  size=True)
                        command_size = viewscript_dict_to_string(size=True,
                                                                 **vs_dict)
                        returncode, message = get_data(command_size,
                                                       user,
                                                       pwd,
                                                       size=True)
                        if check_data(returncode,
                                      message,
                                      stack=check_stack,
                                      size=True):
                            stack = check_stack
                            break
    else:
        stack = forcestack

    if stack is not None:
        msg = (f'[INFO] Data requests will be submitted by '
               f'{stack} stacks.')
    else:
        stack = 'day'
        msg = ('[WARNING] No matching temporal resolution. '
               f'To be coded using CSW. Stack is set to {stack}.')
    print(msg)
    print('\n+------------------------------------+\n| ! - CONNECTION TO CMEMS'
          'HUB - OPEN |\n+------------------------------------+\n\n')

    for retry in range(1, 4):
        retry_flag = False
        date_start = vs_dict['dt-date-min']
        date_end = vs_dict['dt-date-max']
        vs_dict = get_dates_stack(vs_dict, stack)
        while date_start <= date_end:
            date_end_format = vs_dict['cmd-date-max'].strftime(
                vs_dict['format'])
            try:
                vs_dict['outname'] = '-'.join([
                    'CMEMS', vs_dict['prefix'], vs_dict['gridpoint'],
                    vs_dict['out_var_name'],
                    date_end_format + vs_dict['suffix']
                ])
            except KeyError:
                # Not a single grid point: no 'gridpoint' entry.
                vs_dict['outname'] = '-'.join([
                    'CMEMS', vs_dict['prefix'], vs_dict['out_var_name'],
                    date_end_format + vs_dict['suffix']
                ])
            command = viewscript_dict_to_string(cmd=True, **vs_dict)
            outname = vs_dict['outname']
            print(
                '\n----------------------------------\n'
                '- ! - Processing dataset request : '
                f"{outname}\n----------------------------------\n")
            if not Path(target_directory / outname).exists():
                print('## MOTU API COMMAND ##')
                print(command.replace(user, '*****').replace(pwd, '*****'))
                print(
                    '\n[INFO] New data request has been submitted to Copernicus'
                    'Marine Servers. '
                    'If successful, it will extract the data and create your'
                    ' dataset on the fly. Please wait. \n')
                returncode, message = get_data(command, user, pwd)
                if check_data(returncode, message):
                    print('[INFO] The dataset for {} has been stored in {}.'.
                          format(outname, target_directory))
                else:
                    retry_flag = True
            else:
                print(f"[INFO] The dataset for {outname} "
                      f"has already been downloaded in {target_directory}\n")
            # Move on to the sub-request following the one just handled.
            date_start = vs_dict['cmd-date-max'] + dt.timedelta(days=1)
            vs_dict = get_dates_stack(vs_dict, stack, renew=date_start)
        if not retry_flag:
            break

    print("+-------------------------------------+\n| ! - CONNECTION TO CMEMS "
          "HUB - CLOSE |\n+-------------------------------------+\n")

    with open(LOGFILE) as logfile:
        # `retry_flag` makes sure the last attempt itself failed: errors
        # logged by earlier attempts must not fail a run that eventually
        # succeeded on its third attempt.
        if retry == 3 and retry_flag and 'ERROR' in logfile.read():
            print("## YOUR ATTENTION IS REQUIRED ##")
            print(f'Some download requests failed, though {retry} retries. '
                  f'Please see recommendation in {LOGFILE})')
            print('TIPS: you can also apply hereafter recommendations.'
                  '\n1. Do not move netCDF files'
                  '\n2. Double check if a change must be done in the '
                  'viewscript, FTR it is currently set to:\n')
            print(view_myscript)
            print(
                '\n3. Check there is not an ongoing maintenance by looking '
                'at the User Notification Service and Systems & Products Status:\n',
                ''
                '\n4. Then, if relevant, do relaunch manually this python '
                'script to automatically download only failed data request(s)'
                '\n5. Finally, feel free to contact our Support Team either:'
                '\n - By mail: or \n - '
                'By using the webform: '
                ''
                ' or \n - By leaving a post on the forum:'
                '\n\n')
            outname = False
    return outname
def convert_size_hr(size_in_bytes):
    """
    Convert a size in bytes to a human readable value and unit.

    Parameters
    ----------
    size_in_bytes : int or float
        Size to convert, in bytes.

    Raises
    ------
    ValueError
        If ``size_in_bytes`` is negative.

    Returns
    -------
    size : int or float
        Size expressed in ``unit``, rounded to two decimals.
    unit : str
        Binary unit, from ``'B'`` up to ``'EiB'``.

    """
    # Callers rely on a ValueError to detect a negative (i.e. missing)
    # amount of free disk space.
    if size_in_bytes < 0:
        raise ValueError(f'size must not be negative: {size_in_bytes}')

    size_standard = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB')
    # Same (value, unit) shape as every other result, so that callers can
    # always unpack or index it.
    if size_in_bytes == 0:
        return 0, size_standard[0]

    precision = 2
    size = float(size_in_bytes)
    index = 0
    # Dividing step by step (exact for powers of two) avoids the rounding
    # issues of a floating point logarithm at unit boundaries, and the
    # index can never leave the unit table.
    while size >= 1_024 and index < len(size_standard) - 1:
        size /= 1_024
        index += 1
    return round(size, precision), size_standard[index]
def get_disk_stat(drive=None):
    """
    Get disk size statistics.

    Parameters
    ----------
    drive : str or path, optional
        Any path located on the disk to inspect. The default is None,
        meaning ``'/'``.

    Returns
    -------
    disk_stat : list
        Total, used and free space of the disk, in bytes (in that order).

    """
    usage = shutil.disk_usage(drive if drive else '/')
    return [usage.total, usage.used, usage.free]
def get_file_size(files):
    """
    Get the cumulated in-memory size of dataset file(s), in bytes.

    Parameters
    ----------
    files : iterable
        Files to open with ``xarray.open_dataset``.

    Returns
    -------
    mds_size : int
        Sum of the ``nbytes`` of every dataset; ``0`` when ``files`` is empty.

    """
    sizes = []
    for path in files:
        # Opened without CF decoding, and closed again right away.
        with xr.open_dataset(path, decode_cf=False) as dataset:
            sizes.append(dataset.nbytes)
    return sum(sizes)
def check_file_size(mds_size, default_nc_size=None):
    """
    Check whether file(s) of the given size may be written to disk.

    Parameters
    ----------
    mds_size : int
        Size of the data to write, in bytes.
    default_nc_size : int, optional
        Size, in bytes, above which the user is asked for confirmation.
        The default is None, meaning 16_000_000_000.

    Returns
    -------
    check_fs : bool
        ``True`` when writing can go ahead, ``False`` when the size could not
        be assessed, when disk space is insufficient or when the user
        declined to write a very large file.

    """
    if not default_nc_size:
        default_nc_size = 16_000_000_000

    # Checked first: a zero size is an error of its own and there are no
    # meaningful disk statistics to display for it.
    if mds_size == 0:
        print(f'[ERROR-NETCDF] There is an error to assess the size of netCDF '
              'file(s). Please check if data are not corrupted.')
        return False

    size, unit = display_disk_stat(mds_size)
    # A zero size here is how `display_disk_stat` reports that the operation
    # would not fit on the disk.
    if size == 0:
        print(f'[ERROR] Program exit.')
        return False

    if mds_size > default_nc_size:
        print(f'[INFO-NETCDF] The size of the netCDF file would be higher than'
              ' 16 GiB.')
        force = query(
            f'[ACTION-NETCDF] Do you still want to create the netCDF file of '
            f'{BOLD}size {size} {unit}{END}?', 'no')
        if not force:
            print('[ERROR-NETCDF] Writing to disk action has been aborted by '
                  'user due to file size issue.')
            print('[INFO-NETCDF] The script will try to write several netCDF '
                  'files with lower file size.')
            return False
    return True
def display_disk_stat(mds_size):
    """
    Display hard drive statistics to user.

    Parameters
    ----------
    mds_size : int
        Size of the data about to be written, in bytes.

    Returns
    -------
    mds_size_hr : tuple
        ``mds_size`` as converted by ``convert_size_hr``, or ``(0, 'B')``
        when the size conversion fails with ``ValueError``, which is reported
        as a risk of running out of disk space.

    """
    disk_stat = get_disk_stat()
    # disk_stat[2] is the free space; what is left after writing may be
    # negative, which is what the ValueError handler below is meant for.
    free_after = disk_stat[2] - mds_size
    disk_stat.append(free_after)
    disk_stat.append(mds_size)
    try:
        total_hr, used_hr, free_hr, free_after_hr, mds_size_hr = [
            convert_size_hr(item) for item in disk_stat
        ]
    except ValueError as error:
        msg = ("[WARNING] Operation shall be aborted to avoid NO SPACE LEFT ON "
               f"DEVICE error: {error}")
        mds_size_hr = (0, 'B')
    else:
        space = '-' * 37
        # Sizes are shown as "<value> <unit>".
        msg = ''.join(
            (f"[INFO] {space}\n",
             f"[INFO] Total Disk Space (before operation) :"
             f" {total_hr[0]} {total_hr[1]} \n",
             f"[INFO] Used Disk Space (before operation) :"
             f" {used_hr[0]} {used_hr[1]} \n",
             f"[INFO] Free Disk Space (before operation) :"
             f" {free_hr[0]} {free_hr[1]} \n",
             f"[INFO] Operation to save dataset to Disk :"
             f" {mds_size_hr[0]} {mds_size_hr[1]} \n",
             f"[INFO] Free Disk Space (after operation) :"
             f" {free_after_hr[0]} {free_after_hr[1]} \n",
             f"[INFO] {space}"))
    print(''.join(("[INFO] CHECK DISK STATISTICS\n", msg)))
    return mds_size_hr
def get_file_pattern(outname, sep='-', rem=-1, advanced=True):
    """
    Retrieve a ``file_pattern`` from a filename.

    Parameters
    ----------
    outname : str or path
        Filename from which a pattern must be extracted. For a path, only
        its last component is considered.
    sep : str, optional
        Separator. The default is '-'.
    rem : int, optional
        Index of the ``sep``-separated part to remove. The default is -1
        (the last part).
    advanced : bool, optional
        Must be left to ``True``; the alternative is not implemented.
        The default is True.

    Raises
    ------
    NotImplementedError
        If ``advanced`` is false.

    Returns
    -------
    file_pattern : str
        ``outname`` without the removed part and without its last character
        (the separator left behind when the last part is removed).

    """
    if isinstance(outname, os.PathLike):
        outname = Path(outname).name
    if not advanced:
        raise NotImplementedError(
            'Only the advanced file pattern extraction is available.')
    return outname.replace(outname.split(sep)[rem], '')[:-1]
def get_years(ncfiles, sep='-'):
    """
    Retrieve the set of years found in a list of netCDF filenames.

    The year is read as the first four characters of the last
    ``sep``-separated part of each filename.

    Parameters
    ----------
    ncfiles : list
        Filenames (str or path) from which years will be extracted.
    sep : str, optional
        Separator. The default is '-'.

    Returns
    -------
    years : set
        Distinct years, as strings.

    """
    years = set()
    for ncfile in ncfiles:
        last_part = str(ncfile).split(sep)[-1]
        years.add(last_part[:4])
    return years
def get_ncfiles(target_directory, file_pattern=None, year=None):
    """
    Retrieve list of netCDF files of a directory, based on parameters.

    Parameters
    ----------
    target_directory : str or path
        Directory to search (not recursively).
    file_pattern : str, optional
        Text the filename must contain; when combined with ``year`` the
        filename must start with it. The default is None.
    year : str or int, optional
        Year the filename must contain. The default is None.

    Returns
    -------
    ncfiles : list
        List of paths to the matching ``.nc`` files.

    """
    # Accepts str as well as any path-like object.
    target_directory = Path(target_directory)
    if file_pattern and year:
        glob_pattern = f'{file_pattern}*{year}*.nc'
    elif file_pattern:
        glob_pattern = f'*{file_pattern}*.nc'
    elif year:
        glob_pattern = f'*{year}*.nc'
    else:
        glob_pattern = '*.nc'
    return list(target_directory.glob(glob_pattern))
def set_outputfile(file_pattern,
                   target_directory,
                   target_out_directory=None,
                   start_year=None,
                   end_year=None):
    """
    Set output filename based on variables, creating its directory if needed.

    Parameters
    ----------
    file_pattern : str
        First part of the output filename.
    target_directory : str or path
        Directory holding the input files; only used to locate the default
        output directory.
    target_out_directory : str or path, optional
        Output directory. The default is None, meaning the directory
        ``copernicus-processed-data`` next to ``target_directory``.
    start_year : str or int, optional
        First year covered by the file. The default is None.
    end_year : str or int, optional
        Last year covered by the file. The default is None.

    Returns
    -------
    outputfile : path
        ``<file_pattern>-<start_year>.nc`` when ``end_year`` is missing or
        equal to ``start_year``, ``<file_pattern>-<start_year>_<end_year>.nc``
        otherwise, inside the output directory.

    """
    if not target_out_directory:
        target_out_directory = Path(
            Path(target_directory).parent, 'copernicus-processed-data')
    else:
        target_out_directory = Path(target_out_directory)
    # `exist_ok` instead of a prior existence test: no failure if the
    # directory appears in between.
    target_out_directory.mkdir(parents=True, exist_ok=True)

    if start_year == end_year or not end_year:
        filename = f'{file_pattern}-{start_year}.nc'
    else:
        filename = f'{file_pattern}-{start_year}_{end_year}.nc'
    return target_out_directory / filename
def over_write(outputfile):
    """
    Tell whether ``outputfile`` may be written, asking the user when it
    already exists.

    Parameters
    ----------
    outputfile : path
        File about to be written.

    Returns
    -------
    ok_overwrite : bool
        ``True`` when the file does not exist or when the user agreed to
        overwrite it, ``False`` otherwise.

    """
    # Nothing to overwrite, nothing to ask.
    if not outputfile.exists():
        return True
    return query(
        f'[ACTION] The file {outputfile} already exists. Do you want '
        f'{BOLD}to overwrite{END} it?', 'no')
def del_ncfiles(ncfiles):
    """
    Delete files.

    Every file is attempted, even after a failure; failures are printed.

    Parameters
    ----------
    ncfiles : iterable
        Files (str or path) to delete.

    Returns
    -------
    bool
        ``True`` if every file has been deleted, ``False`` if at least one
        deletion failed.

    """
    all_deleted = True
    for fnc in ncfiles:
        try:
            Path(fnc).unlink()
        except OSError as error:
            all_deleted = False
            print(f'[ERROR]: {fnc} : {error.strerror}')
    # Only claim success when there was no failure.
    if all_deleted:
        print(
            '[INFO-NETCDF] All inputs netCDF files have been successfully deleted.'
        )
    return all_deleted
def to_nc4(mds, outputfile):
    """
    Convert file(s) to one single compressed netCDF-4 file.

    Parameters
    ----------
    mds : xarray.Dataset or anything accepted by ``xarray.open_mfdataset``
        Dataset to write, or the file(s) to open and combine by coordinates.
    outputfile : str or path
        File to write.

    Returns
    -------
    nc4 : bool
        ``True`` when the file has been written, ``False`` when writing
        ended with a ``ValueError``.

    """
    if not isinstance(mds, xr.Dataset):
        mds = xr.open_mfdataset(mds, combine='by_coords')
    outputfile = Path(outputfile)

    encoding = {}
    for variable in mds.data_vars:
        # Work on a copy so that the encoding attached to the dataset is
        # left untouched; 'coordinates' is not passed on to the writer.
        var_encoding = {
            key: value
            for key, value in mds[variable].encoding.items()
            if key != 'coordinates'
        }
        var_encoding['zlib'] = True
        var_encoding['complevel'] = 1
        encoding[variable] = var_encoding

    try:
        mds.to_netcdf(path=outputfile,
                      mode='w',
                      engine='netcdf4',
                      encoding=encoding)
    except ValueError as error:
        print(
            f'[INFO-NETCDF] Convertion initialized but ended in error due to : {error}'
        )
        nc4 = False
    else:
        real_file_size = convert_size_hr(outputfile.stat().st_size)
        space = '-' * 20
        msg = ''.join((f"[INFO] {space}\n", f"[INFO-NETCDF] Output file :"
                       f" {str(outputfile)}\n",
                       f"[INFO-NETCDF] File format : netCDF-4\n",
                       f"[INFO-NETCDF] File size : {real_file_size[0]}"
                       f" {real_file_size[1]}\n", f"[INFO] {space}"))
        print(''.join(("[INFO] CONVERTING TO NETCDF4\n", msg)))
        nc4 = True
    return nc4
def to_csv(mds, outputfile):
    """
    Convert file(s) to one single csv file, based on computer limits.

    Above the MS Excel row limit the user is asked to confirm before the
    file is written.

    Parameters
    ----------
    mds : xarray.Dataset or anything accepted by ``xarray.open_mfdataset``
        Dataset to write, or the file(s) to open and combine by coordinates.
    outputfile : str or path
        File to write; its suffix is replaced by ``.csv``.

    Returns
    -------
    csv : bool
        ``True`` when the file has been written, ``False`` otherwise.

    """
    if not isinstance(mds, xr.Dataset):
        mds = xr.open_mfdataset(mds, combine='by_coords')
    outputfile = Path(outputfile)
    msg2 = 'please contact support at:'
    csv = False
    force = False
    ms_excel_row_limit = 1_048_576
    # Product of the length of every coordinate; the initial value keeps
    # the reduction valid for a dataset without coordinates.
    nb_grid_pts = reduce((lambda x, y: x * y),
                         [len(mds[c]) for c in mds.coords], 1)
    if nb_grid_pts > ms_excel_row_limit:
        print(f'[INFO-CSV] The total number of rows exceeds MS Excel limit.'
              f' It is {BOLD}NOT recommended{END} to continue.')
        force = query(
            f'[ACTION-CSV] Do you still want to create this CSV file with'
            f' {BOLD}{nb_grid_pts} rows{END} (though most computers will run out of memory)?',
            'no')
    # `<=`: a row count equal to the limit is within it and no question
    # was asked, so it must not be reported as "aborted by user".
    if nb_grid_pts <= ms_excel_row_limit or force:
        try:
            dataframe = mds.to_dataframe().reset_index().dropna()
            outputfile = outputfile.with_suffix('.csv')
            dataframe.to_csv(outputfile, index=False)
        except IOError:
            print(f'[INFO-CSV] Convertion initialized but ended in error.')
        else:
            space = '-' * 18
            msg = ''.join(
                (f"[INFO] {space}\n", f"[INFO-CSV] Output file :"
                 f" {str(outputfile)}\n",
                 f"[INFO-CSV] File format : Comma-Separated Values\n",
                 f"[INFO-CSV] Preview Stat:\n {dataframe.describe()}\n",
                 f"[INFO] {space}"))
            print(''.join(("[INFO] CONVERTING TO CSV\n", msg)))
            csv = True
    else:
        print('[WARNING-CSV] Writing to disk action has been aborted by user '
              f'due to very high number of rows ({nb_grid_pts}) exceeding most '
              'computers and softwares limits (such as MS Excel).')
        print(' '.join(
            ('[INFO-CSV] A new function is under beta-version to handle '
             'this use case automatically.\n'
             '[ACTION-CSV] Usage:\n'
             'cmemstb to_mfcsv PATH_TO_NETCDF_DIRECTORY PATH_TO_OUTPUT_DIRECTORY\n'
             '[INFO-CSV] To upvote this feature,', msg2)))
    # Release the file handles held by the dataset.
    mds.close()
    return csv
def _write_slice_csv(dataframe, variable_name, output_directory, gridbox,
                     depth_index=None):
    """
    Write one time (and optionally depth) slice to its own CSV file.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Slice converted with ``to_dataframe()``; must hold a ``time`` column.
    variable_name : str
        Column used to decide whether the slice holds any valid value.
    output_directory : path
        Directory where the CSV file is written.
    gridbox : str or None
        Grid-size tag inserted in the file name; skipped when None.
    depth_index : int, optional
        Depth level index inserted in the file name; skipped when None.

    Returns
    -------
    None.

    """
    if dataframe[variable_name].dropna().empty:
        # Only missing values in this slice: no file is produced.
        return
    t_format = pd.to_datetime(str(dataframe['time'].values[0])).strftime("%Y%m%d")
    v_format = '_'.join(
        [str(column) for column in dataframe
         if column not in ('lon', 'lat', 'longitude', 'latitude', 'depth', 'time')])
    name_parts = ['CMEMS']
    if gridbox is not None:
        name_parts.append(f'gridbox_{gridbox}')
    name_parts.append(f'time_{t_format}')
    if depth_index is not None:
        name_parts.append(f'depth_{depth_index}')
    name_parts.append(v_format)
    output_filename = '-'.join(name_parts) + '.csv'
    output_fpath = output_directory / output_filename
    if output_fpath.exists():
        print(f"[INFO] The CSV file {output_filename} already exists"
              f" in {output_directory.absolute()}.")
        return
    valid_rows = dataframe.dropna()
    try:
        valid_rows.to_csv(output_fpath)
    except Exception as exception:
        # Best effort: report the failure and carry on with other slices.
        print(f"[ERROR] Failed to write to disk: {repr(exception)}.")
        return
    space = '-' * 17
    msg = ''.join(
        (f"[INFO] {space}\n", f"[INFO-CSV] Output file :"
         f" {output_fpath}\n",
         f"[INFO-CSV] File format : Comma-Separated Values\n",
         f"[INFO-CSV] Preview Stat:\n {valid_rows.describe()}\n",
         f"[INFO] {space}"))
    print(''.join(("[INFO] CONVERTING TO CSV\n", msg)))


def to_mfcsv(input_directory, output_directory, max_depth_level=None):
    """
    Convert netcdf file(s) to multiple csv files, based on MS Excel Limits.

    One CSV file is written per time step and, when the dataset has a
    ``depth`` dimension, per depth level. Slices holding only missing
    values and files that already exist are skipped.

    Parameters
    ----------
    input_directory : xarray.Dataset, str, list or path
        An already opened dataset, any input accepted by
        ``xr.open_mfdataset`` (glob string or list of files), or a
        directory whose ``*.nc`` files are opened.
    output_directory : path or str
        Directory receiving the CSV files; created if missing.
    max_depth_level : int, optional
        Index of the last depth level to export (0-based, inclusive).
        The default is None, meaning all levels.

    Returns
    -------
    mfcsv : bool
        True once all slices have been processed, False if
        ``max_depth_level`` is negative.

    """
    mfcsv = False
    if isinstance(input_directory, xr.Dataset):
        mds = input_directory
    else:
        try:
            # Either a string glob in the form "path/to/my/files/*.nc"
            # or an explicit list of files to open.
            mds = xr.open_mfdataset(input_directory, combine='by_coords')
        except Exception:
            input_directory = Path(input_directory)
            mds = xr.open_mfdataset(
                [str(item) for item in input_directory.glob('*.nc')],
                combine='by_coords')
    if isinstance(output_directory, str):
        output_directory = Path(output_directory)
    try:
        if not output_directory.exists():
            output_directory.mkdir(parents=True)
            print(f'[INFO] Directory successfully created : {output_directory}.')
    except Exception as exception:
        print(f"[ERROR] Failed to create directory due to {str(exception)}.")
    ms_excel_row_limit = 1_048_576
    nb_grid_pts = reduce((lambda x, y: x * y),
                         [len(mds[c]) for c in mds.coords], 1)
    # NOTE(review): the original indentation was lost; this assumes the
    # limit check only gates the notice below and that the split is always
    # performed - confirm against the upstream source.
    if nb_grid_pts > ms_excel_row_limit:
        print("[INFO] The total number of rows for a single CSV file exceeds MS Excel limit.")
    variable_name = list(mds.data_vars.keys())[0]
    # The grid-size tag does not depend on the slice: compute it once.
    try:
        gridbox = '_'.join([str(len(mds[lonlat])) for lonlat in mds.coords
                            if lonlat not in ['depth', 'time']])
    except Exception as exception:
        print(f"[ERROR] Failed to set boundingbox: {str(exception)}")
        gridbox = None
    nb_times = len(mds.time)
    # Test for the depth dimension explicitly rather than catching
    # AttributeError around the whole export, which would also swallow
    # unrelated errors raised while writing.
    if 'depth' in mds.dims:
        nb_depths = len(mds.depth)
        if max_depth_level is None:
            last_depth = nb_depths - 1
        elif max_depth_level < 0:
            print(f"[ERROR] Maximum depth level must be a positive index"
                  f" from 0 to {nb_depths - 1}")
            return mfcsv
        else:
            # Clamp so the announced file count matches the levels that
            # are actually iterated.
            last_depth = min(max_depth_level, nb_depths - 1)
        print(f"[INFO] As a consequence, the total number of CSV files "
              f"to be generated is: {nb_times * (last_depth + 1)}")
        for t in range(nb_times):
            for d in range(last_depth + 1):
                _write_slice_csv(mds.isel(depth=d, time=t).to_dataframe(),
                                 variable_name, output_directory, gridbox,
                                 depth_index=d)
    else:
        print("[INFO] As a consequence, the total number of CSV files "
              f"to be generated is: {nb_times}")
        for t in range(nb_times):
            _write_slice_csv(mds.isel(time=t).to_dataframe(),
                             variable_name, output_directory, gridbox)
    mfcsv = True
    return mfcsv
def to_nc4_csv(ncfiles, outputfile, skip_csv=False, default_nc_size=None):
    """
    Convert file(s) to both netCDF-4 and csv files, based on computer limits.

    Parameters
    ----------
    ncfiles : str or list
        Input passed to ``get_file_size`` and ``xr.open_mfdataset``.
    outputfile : path or str
        Destination of the netCDF-4 file; the csv file uses the same
        path with a ``.csv`` suffix.
    skip_csv : bool, optional
        Skip the csv conversion. The default is False.
    default_nc_size : int, optional
        Size threshold handed to ``check_file_size``. The default is None,
        which falls back to 16_000_000_000.

    Returns
    -------
    nc4 : bool
        Result of ``to_nc4``, or False when it has not been called.
    csv : bool
        Result of ``to_csv``, or False when it has not been called.
    check_ow : bool
        Result of ``over_write`` for the netCDF-4 output file.

    """
    nc4 = False
    csv = False
    # Accept plain strings (e.g. from the command line), as to_nc4 and
    # to_csv do; ``with_suffix`` below needs a path object.
    outputfile = Path(outputfile)
    if not default_nc_size:
        default_nc_size = 16_000_000_000
    mds_size = get_file_size(ncfiles)
    check_fs = check_file_size(mds_size, default_nc_size)
    check_ow = over_write(outputfile)
    check_ow_csv = over_write(outputfile.with_suffix('.csv'))
    if check_ow and check_fs:
        with xr.open_mfdataset(ncfiles, combine='by_coords') as mds:
            nc4 = to_nc4(mds, outputfile)
    elif not check_ow:
        print('[WARNING-NETCDF] Writing to disk action has been aborted by '
              'user due to already existing file.')
    elif not check_fs:
        # File-size check failed: do not attempt the csv conversion either.
        skip_csv = True
    if check_ow_csv and not skip_csv:
        with xr.open_mfdataset(ncfiles, combine='by_coords') as mds:
            csv = to_csv(mds, outputfile)
    return nc4, csv, check_ow
def post_processing(outname,
                    target_directory,
                    target_out_directory=None,
                    delete_files=True):
    """
    Post-process the data already located on disk.

    Try to merge every matching file into a single output through
    ``to_nc4_csv``. If neither a netCDF-4 nor a csv file could be produced
    (and overwriting was allowed), fall back to one output per year.
    Source files are deleted only when ``delete_files`` is set and the
    netCDF-4 conversion succeeded (for every year in the fallback case).

    Parameters
    ----------
    outname : str
        Name handed to ``get_file_pattern`` to derive the file pattern.
    target_directory : path or str
        Directory searched by ``get_ncfiles``.
    target_out_directory : path or str, optional
        Output directory handed to ``set_outputfile``.
        The default is None.
    delete_files : bool, optional
        Delete the source files after a successful netCDF-4 conversion.
        The default is True.

    Raises
    ------
    SystemExit
        If ``get_file_pattern`` raises an ``AttributeError``.

    Returns
    -------
    processing : bool
        True if the conversion step has been reached, False if no file
        matched the pattern.

    See Also
    --------
    get_file_pattern : called from this method
    get_ncfiles : called from this method
    get_years : called from this method
    set_outputfile : called from this method
    to_nc4_csv : called from this method
    del_ncfiles : called from this method
    """
    processing = False
    try:
        file_pattern = get_file_pattern(outname)
    except AttributeError:
        print(f'[ERROR] Program exits due to fatal error. There is no need '
              'to re-run this script if no action has been taken from user side.')
        raise SystemExit
    sel_files = get_ncfiles(target_directory, file_pattern)
    years = get_years(sel_files)
    try:
        # min()/max() raise ValueError when no file matched the pattern.
        single_outputfile = set_outputfile(file_pattern,
                                           target_directory,
                                           target_out_directory,
                                           start_year=min(years),
                                           end_year=max(years))
    except ValueError as error:
        print(
            f'[ERROR] Processing failed due to no file matching pattern : {error}'
        )
    else:
        nc4, csv, ow_choice = to_nc4_csv(sel_files, single_outputfile)
        if not nc4 and not csv and ow_choice:
            # Fallback: one output per year. Track every year's result so
            # that the sources are deleted only if all of them were
            # converted, not merely the last one.
            yearly_nc4 = []
            for year in years:
                print(year)
                ncfiles = get_ncfiles(target_directory, file_pattern, year)
                outfilemerged = set_outputfile(file_pattern,
                                               target_directory,
                                               target_out_directory,
                                               start_year=year)
                year_nc4, csv, ow_choice = to_nc4_csv(ncfiles, outfilemerged)
                yearly_nc4.append(year_nc4)
            nc4 = bool(yearly_nc4) and all(yearly_nc4)
        if delete_files and nc4:
            del_ncfiles(sel_files)
        processing = True
    return processing
def get(local_storage_directory=None,
        target_out_directory=None,
        view_myscript=None,
        user=None,
        pwd=None,
        forcestack=False,
        delete_files=True):
    """Download data, then post-process it to compressed and tabular formats.

    Main entry point. The working directory is resolved with
    ``set_target_directory``, the ``VIEW SCRIPT`` template command
    (generated by the Copernicus Marine website) is handled by
    ``process_viewscript``, and the resulting files are handed to
    ``post_processing``.

    Parameters
    ----------
    local_storage_directory : path or str, optional
        Passed to ``set_target_directory``. The default is None.
    target_out_directory : path or str, optional
        Passed to ``post_processing``. The default is None.
    view_myscript : str, optional
        ``VIEW SCRIPT`` template command, passed to ``process_viewscript``.
        The default is None.
    user : str, optional
        Passed to ``process_viewscript``. The default is None.
    pwd : str, optional
        Passed to ``process_viewscript``. The default is None.
    forcestack : bool, optional
        Passed to ``process_viewscript``. The default is False.
    delete_files : bool, optional
        Passed to ``post_processing``. The default is True.

    Returns
    -------
    True.

    See Also
    --------
    process_viewscript : Method to parse `VIEW SCRIPT`
    post_processing : Method to convert downloaded data to other format

    Examples
    --------
    Ex 1. Let the user be guided by the script with interactive questions:

    >>> cmemstb get

    Ex 2. Get data matching a ``VIEW SCRIPT`` template command passed as
    `parameter`:

    >>> cmemstb get --view_myscript="python -m motuclient --motu --service-id GLOBAL_ANALYSIS_FORECAST_PHY_001_024-TDS --product-id global-analysis-forecast-phy-001-024 --longitude-min -20 --longitude-max 45 --latitude-min 25 --latitude-max 72 --date-min \\"2019-08-18 12:00:00\\" --date-max \\"2020-08-31 12:00:00\\" --depth-min 0.493 --depth-max 0.4942 --variable thetao --out-dir <OUTPUT_DIRECTORY> --out-name <OUTPUT_FILENAME> --user <USERNAME> --pwd <PASSWORD>"

    Notes
    -----
    For Windows Operating System Users and when using the
    ``--view_myscript`` as parameter, you might want to double check
    that ``double quote`` around dates are well escaped
    (see above example).

    """
    storage_directory = set_target_directory(local_storage_directory)
    download_options = {
        'target_directory': storage_directory,
        'view_myscript': view_myscript,
        'user': user,
        'pwd': pwd,
        'forcestack': forcestack,
    }
    downloaded_outname = process_viewscript(**download_options)
    post_processing(outname=downloaded_outname,
                    target_directory=storage_directory,
                    target_out_directory=target_out_directory,
                    delete_files=delete_files)
    return True
def cli():
    """
    Expose a selection of functions as command-line sub-commands.

    The mapping below is handed to ``fire.Fire``; each key is the
    sub-command name and each value the function it runs.

    Returns
    -------
    None.

    """
    commands = dict(
        display_disk_stat=display_disk_stat,
        get=get,
        get_credentials=get_credentials,
        get_data=get_data,
        get_file_pattern=get_file_pattern,
        get_ncfiles=get_ncfiles,
        post_processing=post_processing,
        process_viewscript=process_viewscript,
        set_target_directory=set_target_directory,
        to_nc4_csv=to_nc4_csv,
        to_nc4=to_nc4,
        to_csv=to_csv,
        to_mfcsv=to_mfcsv,
    )
    fire.Fire(commands)


if __name__ == '__main__':
    cli()