import re
import zipfile
from pathlib import Path
from typing import Dict, Optional, Tuple, Union, cast
from zipfile import ZipFile
import logging
import numpy as np
import xarray as xr
from numpy import ndarray
logger = logging.getLogger("openghg.transform.emissions")
logger.setLevel(logging.DEBUG) # Have to set level for logger as well as handler
ArrayType = Optional[Union[ndarray, xr.DataArray]]
[docs]
def parse_edgar(
datapath: Path,
date: str,
species: Optional[str] = None,
domain: Optional[str] = None,
lat_out: ArrayType = None,
lon_out: ArrayType = None,
# sector: Optional[str] = None,
# period: Optional[Union[str, tuple]] = None,
edgar_version: Optional[str] = None,
) -> Dict:
"""
Read and parse input EDGAR data.
Notes: Only accepts annual 2D grid maps in netcdf (.nc) format for now.
Does not accept monthly data yet.
EDGAR data is global on a 0.1 x 0.1 grid. This function allows products
to be created for a given year which cover specific regions (and matches
to the OpenGHG data schema, including units and coordinate names).
Region information can be specified as follows:
- To use a pre-defined domain use the domain keyword only.
- To define a new domain use the domain, lat_out, lon_out keywords
- If no domain or lat_out, lon_out data is supplied, the global EDGAR
data will be added labelled as "globaledgar" domain.
Pre-exisiting domains are defined within the openghg_defs "domain_info.json" file.
Metadata will also be added to the stored data including:
- "domain": domain (e.g. "europe") OR "globaledgar"
- "source": "anthro" (for "TOTAL"), source name from file otherwise
- "database": "EDGAR"
- "database_version": edgar_version (e.g. "v60", "v50", "v432")
Args:
datapath: Path to data folder or zip archive for EDGAR data
date: Year to extract. Expect a string of the form "YYYY"
species: Species name being extracted
domain: Domain name for new or pre-existing domain
lat_out: Latitude values for new domain
lon_out: Longitude values for new domain
edgar_version: EDGAR version in file. Will be inferred otherwise.
Returns:
dict: Dictionary of data
TODO: Allow date range to be extracted rather than year?
TODO: Add monthly parsing and sector stacking options
"""
import tempfile
from openghg.standardise.meta import assign_flux_attributes, define_species_label
from openghg.store import infer_date_range
from openghg.util import (
clean_string,
molar_mass,
synonyms,
find_coord_name,
convert_internal_longitude,
timestamp_now,
)
# Currently based on acrg.name.emissions_helperfuncs.getedgarannualtotals()
# Additional edgar functions which could be incorporated.
# - getedgarv5annualsectors
# - getedgarv432annualsectors
# - (monthly sectors?)
# TODO: Work out how to select frequency
# - could try and relate to period e.g. "monthly" versus "yearly" etc.
period = None
raw_edgar_domain = "globaledgar"
lat_out, lon_out = _check_lat_lon(domain, lat_out, lon_out)
if domain is None:
domain = raw_edgar_domain
# TODO: Add check for period? Only monthly or yearly (or equivalent inputs)
# Check if input is a zip file
if zipfile.is_zipfile(datapath):
zipped = True
zip_folder = zipfile.ZipFile(datapath)
else:
zipped = False
known_versions = _edgar_known_versions()
# Check readme file for edgar version (if not specified)
if zipped and edgar_version is None:
edgar_version = _check_readme_version(zippath=zip_folder)
elif edgar_version is None:
edgar_version = _check_readme_version(datapath=datapath)
# Extract list of data files
if zipped:
zip_filelist = zip_folder.infolist()
# folder_filelist = list(zip_folder.namelist())
folder_filelist = [Path(filename.filename) for filename in zip_filelist]
else:
folder_filelist = list(datapath.glob("*"))
# Extract netcdf files (only, for now) - ".txt" is also an option (not implemented)
suffix = ".nc"
data_files = [file for file in folder_filelist if file.suffix == suffix]
if not data_files:
raise ValueError("Expect EDGAR '.nc' files." f"No suitable files found within datapath: {datapath}")
for file in data_files:
try:
db_info = _extract_file_info(file)
except ValueError:
db_info = {}
continue
# Extract species from filename if not specified
try:
species_from_file: Optional[str] = db_info["species"]
except KeyError:
species_from_file = None
if species is None:
species = species_from_file
# Check synonyms and compare against filename value
if species is not None:
species_label = define_species_label(species)[0]
# species_label = synonyms(species).lower()
else:
raise ValueError("Unable to retrieve species from database filenames." " Please specify")
if species_from_file is not None and species_label != synonyms(species_from_file):
logger.warning(
"Input species does not match species extracted from",
" database filenames. Please check.",
)
# If version not yet found, extract version from file naming scheme
if edgar_version is None:
possible_version = db_info["version"]
possible_version_clean = clean_string(possible_version)
if possible_version_clean in known_versions:
edgar_version = possible_version_clean
edgar_version = clean_string(edgar_version)
if edgar_version not in known_versions:
if edgar_version is None:
raise ValueError(f"Unable to infer EDGAR version ({edgar_version})." f" Please pass as an argument (one of {known_versions})")
else:
raise ValueError(f"Unable to infer EDGAR version." f" Please pass as an argument (one of {known_versions})")
# TODO: May want to split out into a separate function, so we can use this for
# - yearly - "v6.0_CH4_2015_TOTALS.0.1x0.1.nc"
# - sectoral - "v6.0_CH4_2015_ENE.0.1x0.1.nc"
# - monthly sectoral - "v6.0_CH4_2015_1_ENE.0.1x0.1.nc", "v6.0_CH4_2015_2_ENE.0.1x0.1.nc", ...
if isinstance(date, int):
date = str(date)
if len(date) == 4:
year = int(date)
else:
raise ValueError(f"Do no accept date which does not represent a year yet: {date}")
files_by_year = {}
for file in data_files:
try:
file_info = _extract_file_info(file)
except ValueError:
continue
# Check if data is actually monthly "...2015_1" etc. - can't parse yet
if "month" in file_info:
raise NotImplementedError("Unable to parse monthly EDGAR data at present.")
year_from_file = file_info["year"]
files_by_year[year_from_file] = file
if year_from_file == year:
edgar_file = file
edgar_file_info = file_info
break
else:
all_years = list(files_by_year.keys())
all_years.sort()
start_year, end_year = all_years[0], all_years[-1]
if year < start_year:
raise ValueError(
f"EDGAR {edgar_version} range: {start_year}-{end_year}." f" {year} is before this period."
)
elif year > end_year:
logger.info(
f"Using last available year from EDGAR {edgar_version} range:" f"{start_year}-{end_year}."
)
edgar_file = files_by_year[end_year]
edgar_file_info = _extract_file_info(edgar_file)
# For a zipped archive need to unzip the netcdf file and place in a
# temporary directory.
if zipped:
temp_extract_folder = tempfile.TemporaryDirectory()
for zipinfo in zip_filelist:
if zipinfo.filename == edgar_file.name:
zip_folder.extract(zipinfo, path=temp_extract_folder.name)
edgar_file = temp_extract_folder.name / edgar_file
break
# Dimension - (lat, lon) - no time dimension
# time is not included in the file just in the filename *sigh*!
# v432_CH4_1978.0.1x0.1.nc (or .zip)
# v50_CH4_1978.0.1x0.1.nc (or .zip)
# v6.0_CH4_1978_TOTALS.0.1x0.1.nc
# v50_CO2_excl_short-cycle_org_C_1978.0.1x0.1.nc (or .zip)
# v50_CO2_org_short-cycle_C_1978.0.1x0.1.nc (or .zip)
# v50_N2O_1978.0.1x0.1.zip (or .zip)
with xr.open_dataset(edgar_file) as temp:
edgar_ds = temp
# Expected name e.g. "emi_ch4", "emi_co2"
name = f"emi_{species_label}"
# For reference, from "_readme.html" from v6.0 data:
# 'Yearly Emissions gridmaps in ton substance / 0.1degree x 0.1degree / year
# for the .txt files with longitude and latitude coordinates referring to
# the low-left corner of each grid-cell.'
# 'Monthly Emissions gridmaps in ton substance / 0.1degree x 0.1degree / month
# for the .txt files with longitude and latitude coordinates referring to
# the low-left corner of each grid-cell.'
# 'Emissions gridmaps in kg substance /m2 /s for the .nc files with longitude
# and latitude coordinates referring to the cell center of each grid-cell.'
# Convert from kg/m2/s to mol/m2/s
species_molar_mass = molar_mass(species_label)
kg_to_g = 1e3
flux_da = edgar_ds[name]
# flux_values: ndarray[Any, Any] = flux_da.values * kg_to_g / species_molar_mass
flux_da = flux_da * kg_to_g / species_molar_mass
units = "mol/m2/s"
lat_name = find_coord_name(flux_da, options=["lat", "latitude"])
lon_name = find_coord_name(flux_da, options=["lon", "longitude"])
if lat_name is None or lon_name is None:
raise ValueError(
f"Could not find '{lat_name}' or '{lon_name}' in EDGAR file.\n"
" Please check this is a 2D grid map."
)
# Check range of longitude values and convert to -180 - +180
flux_da = convert_internal_longitude(flux_da, lon_name=lon_name)
if lat_out is not None and lon_out is not None:
# Will produce import error if xesmf has not been installed.
from openghg.transform import regrid_uniform_cc
from openghg.util import cut_data_extent
# To improve performance of regridding algorithm cut down the data
# to match the output grid (with buffer).
flux_da_cut = cut_data_extent(flux_da, lat_out, lon_out)
flux_values = flux_da_cut.values
lat_in_cut = flux_da_cut[lat_name]
lon_in_cut = flux_da_cut[lon_name]
# regrid2d() used within acrg code for equivalent regrid function
# but switched to using xesmf (rather than iris) here instead.
flux_values = regrid_uniform_cc(flux_values, lat_out, lon_out, lat_in_cut, lon_in_cut)
else:
lat_out = flux_da[lat_name]
lon_out = flux_da[lon_name]
flux_values = flux_da.values
edgar_attrs = edgar_ds.attrs
# After the data has been extracted and used from the unzipped netcdf
# file clean up and remove temporary directory and file.
if zipped:
temp_extract_folder.cleanup()
# Check for "time" dimension and add if missing.
flux_ndim = flux_values.ndim
time_name = "time"
if time_name in flux_da:
time = flux_da[time_name].values
elif time_name not in flux_da and flux_ndim == 2:
time = np.array([f"{year}-01-01"], dtype="datetime64[ns]")
flux = flux_values[np.newaxis, ...]
elif flux_ndim != 3:
raise ValueError(f"Expected '{name}' to contain 2 or 3 dimensions. Actually: {flux_ndim}")
dims = ("time", "lat", "lon")
em_data = xr.Dataset(
{"flux": (dims, flux)}, coords={"time": time, "lat": lat_out, "lon": lon_out}, attrs=edgar_attrs
)
# Some attributes are numpy types we can't serialise to JSON so convert them
# to their native types here
attrs = {}
for key, value in em_data.attrs.items():
try:
attrs[key] = value.item()
except AttributeError:
attrs[key] = value
author_name = "OpenGHG Cloud"
em_data.attrs["author"] = author_name
source_from_file = edgar_file_info["source"]
if source_from_file in ("TOTALS", ""):
source = "anthro"
elif species_label == "co2" and "TOTALS" in source_from_file:
co2_source = "_".join(source_from_file.split("_")[:-1])
source = clean_string(f"{co2_source}_anthro")
else:
source = clean_string(source_from_file)
database = "EDGAR"
database_version = clean_string(edgar_version)
metadata = {}
metadata.update(attrs)
metadata["species"] = species_label
metadata["domain"] = domain
metadata["source"] = source
metadata["date"] = date
metadata["database"] = database
metadata["database_version"] = database_version
metadata["author"] = author_name
metadata["processed"] = str(timestamp_now())
metadata["data_type"] = "emissions"
attrs = {"author": metadata["author"], "processed": metadata["processed"]}
# Infer the date range associated with the flux data
em_time = em_data.time
start_date, end_date, period_str = infer_date_range(em_time, filepath=edgar_file.name, period=period)
prior_info_dict = {
"EDGAR": {
"version": f"EDGAR {edgar_version}",
"filename": edgar_file.name,
"raw_resolution": "0.1 degrees x 0.1 degrees",
"reference": edgar_ds.attrs["source"],
}
}
metadata["start_date"] = str(start_date)
metadata["end_date"] = str(end_date)
metadata["min_longitude"] = round(float(em_data["lon"].min()), 5)
metadata["max_longitude"] = round(float(em_data["lon"].max()), 5)
metadata["min_latitude"] = round(float(em_data["lat"].min()), 5)
metadata["max_latitude"] = round(float(em_data["lat"].max()), 5)
metadata["time_resolution"] = "standard"
metadata["time_period"] = period_str
key = "_".join((species_label, source, domain, date))
emissions_data: Dict[str, dict] = {}
emissions_data[key] = {}
emissions_data[key]["data"] = em_data
emissions_data[key]["metadata"] = metadata
emissions_data[key]["attributes"] = attrs
emissions_data = assign_flux_attributes(emissions_data, units=units, prior_info_dict=prior_info_dict)
return emissions_data
def _check_lat_lon(
domain: Optional[str] = None, lat_out: ArrayType = None, lon_out: ArrayType = None
) -> Tuple[Optional[ndarray], Optional[ndarray]]:
"""
Define and check latitude and longitude values for a domain.
The domain can be used in one of two ways:
1. To specify a pre-exisiting lat, lon extent which can be extracted
2. To supply a name for a new lat, lon extent which must be specified
For case 1, only domain needs to be specified (lat_out and lon_out can
be specified but they must already exactly match the domain definition).
The details will be extracted from openghg_defs "domain_info.json" file.
For case 2, the domain, lat_out and lon_out must all be specified.
If none of these values are specified, (None, None) will be returned. This
is valid behaviour.
Args:
domain: Domain name for a pre-existing domain or a new domain
lat_out: Latitude values (only needed if domain is new)
lon_out: Longitude values (only needed if domain is new)
Returns:
ndarray, ndarray: Latitude and longitude arrays
None, None: if all inputs are None, a tuple of Nones will be returned.
"""
from openghg.util import convert_longitude, find_domain
if lat_out is not None or lon_out is not None:
if domain is None:
raise ValueError(
"Please specify new 'domain' name if selecting new" " latitude, longitude values"
)
if isinstance(lat_out, xr.DataArray):
lat_out = cast(ndarray, lat_out.values)
if isinstance(lon_out, xr.DataArray):
lon_out = cast(ndarray, lon_out.values)
if domain is not None:
# If domain is specified, attempt to extract lat/lon values from
# pre-defined definitions.
try:
lat_domain, lon_domain = find_domain(domain)
except ValueError:
# If domain cannot be found and lat, lon values have not been
# defined raise an error.
if lat_out is None or lon_out is None:
raise ValueError("To create new domain please input" " 'lat_out' and 'lon_out' values.")
else:
# Check domain latitude and longitude values against any
# lat_out and lon_out values specified to check they match.
if lat_out is not None:
if not np.array_equal(lat_domain, lat_out):
raise ValueError(
"Latitude values should not be specified"
f" when using pre-defined domain {domain}"
" (values don't match)."
)
else:
lat_out = lat_domain
if lon_out is not None:
if not np.array_equal(lon_domain, lon_out):
raise ValueError(
"Longitude values should not be specified"
f" when using pre-defined domain {domain}"
" (values don't match)."
)
else:
lon_out = lon_domain
if lon_out.max() > 180 or lon_out.min() < -180:
raise ValueError(
"Invalid domain definition."
" Expected longitude in range: -180 - 180."
f"Current longitude: {lon_out.min()} - {lon_out.max()}"
)
if lon_out is not None and (lon_out.max() > 180 or lon_out.min() < -180):
logger.info("Converting longitude to stay within -180 - 180 bounds")
lon_converted = convert_longitude(lon_out)
lon_out = cast(Optional[ndarray], lon_converted)
return lat_out, lon_out
def _edgar_known_versions() -> list:
"""Define list of known versions for the EDGAR database"""
known_versions = ["v432", "v50", "v60"]
return known_versions
def _check_readme_version(
datapath: Optional[Path] = None, zippath: Optional[ZipFile] = None
) -> Optional[str]:
"""
Attempts to extract the edgar version from the associated "_readme.html"
file, if present.
Args:
datapath : Path to the folder containing the downloaded EDGAR files
zippath: Path to zipped archive file (direct from EDGAR)
Returns:
str : edgar version if found (None otherwise)
"""
# Work out version if possible from readme
# All database versions so far may contain "_readme.html" file
# "v6.0"
# - "TOTALS_nc.zip" is what is downloaded from website
# - "_readme.html" title line: "<title>EDGAR v6.0_GHG (2021)</title>"
# "v5.0"
# - "v50_CH4_1970_2015.zip" can be downloaded
# - "_readme.html" title line: "<title>EDGAR v5.0 (2019)</title>"
# "v4.3.2"
# - "v432_CH4_1970_2012.zip" can be downloaded
# - "_readme.html" title line: "<title>EDGAR v4.3.2 (2017)</title>"
# Check for readme html file and, if present, extract version
readme_filename = "_readme.html"
if zippath is not None:
try:
# Cast extracted bytes to a str object
readme_data: Optional[str] = str(zippath.read(readme_filename))
except ValueError:
readme_data = None
elif datapath is not None:
readme_filepath = datapath.joinpath(readme_filename)
if readme_filepath.exists():
readme_data = readme_filepath.read_text()
else:
readme_data = None
else:
raise ValueError("One of datapath or zippath must be specified.")
if readme_data is not None:
try:
# Ignoring types as issues caught by try-except statement
# Find and extract title line from html file
title_line = re.search("<title.*?>(.+?)</title>", readme_data).group() # type: ignore
# Extract version e.g. "v6.0" or "v4.3.2"
edgar_version = re.search(r"v\d[.]\d[.]?\d*", title_line).group() # type: ignore
except ValueError:
pass
else:
# Check against known versions and remove '.' if these don't match.
known_versions = _edgar_known_versions()
if edgar_version not in known_versions:
edgar_version = edgar_version.replace(".", "")
else:
edgar_version = None
return edgar_version
def _extract_file_info(edgar_file: Union[Path, str]) -> Dict:
"""
Extract details from EDGAR filename.
Expected filenames roughly of the form:
- {version}_{species}_{year}.{resolution}.nc
- {version}_{species}_{year}_{source}.{resolution}.nc
- {version}_{species}_{year}_{month}.._{source..}.{resolution}.nc
Args:
edgar_file: Filename for an EDGAR database file
Returns:
Dict : Elements of the filename as a dictionary
>>> _extract_file_info("v6.0_CH4_2015_TOTALS.0.1x0.1.nc")
{"version": "v6.0", "species": "CH4", "year": 2015,
"source": "TOTALS", "resolution": "0.1x0.1"}
>>> _extract_file_info("v50_CH4_2015.0.1x0.1.nc")
{"version": "v50", "species": "CH4", "year": 2015,
"source": "", "resolution": "0.1x0.1"}
>>> _extract_file_info("v432_CH4_2010_9_IPCC_6A_6D.0.1x0.1.nc")
{"version": "v432", "species": "CH4", "year": 2010, "month": 9,
"source": "IPCC-6A-6D", "resolution": "0.1x0.1"}
"""
# Can extract details about files from the filename itself.
# e.g. "v6.0_CH4_2015_TOTALS.0.1x0.1.nc"
# Extract filename stem (name without extension) and split
edgar_file = Path(edgar_file)
filename = edgar_file.stem
filename_split = filename.split("_")
# Check if we can extract the known components from the filename
# - version, species (upper), year
try:
version = filename_split[0]
species = filename_split[1]
except IndexError:
raise ValueError(f"Did not recognise input file format: {filename}")
else:
index_remaining = 2
# CO2 input has 2 options e.g.
# - "v6.0_CO2_excl_short-cycle_org_C_2015_TOTALS.0.1x0.1.nc"
# - "v6.0_CO2_org_short-cycle_C_1970_TOTALS.0.1x0.1.nc"
if species.lower() == "co2":
co2_options = ["excl_short-cycle_org_C", "org_short-cycle_C"]
for option in co2_options:
if option in filename:
option_split = option.split("_")
extra_sections = len(option_split)
index_remaining += extra_sections
source = "_".join(option_split[0:2]) + "_"
break
else:
source = ""
else:
source = ""
# Check if year can be cast to integer to check this is a valid value
try:
year_str = filename_split[index_remaining]
year = int(year_str)
except IndexError:
raise ValueError(f"Unable to cast year extracted from file format to an integer: {year_str}")
except ValueError:
# In some files there is no source specified so
# filename_split[2] contains the year and resolution
# e.g. "v50_CH4_2015.0.1x0.1.nc" --> "2015.0.1x0.1"
try:
year = int(year_str.split(".")[0])
except ValueError:
raise ValueError(f"Could not find valid year value from file: {filename}")
else:
index_remaining += 1
# Check whether month is included in filename
# e.g. "v6.0_CH4_2015_1_ENE.0.1x0.1.nc"
try:
month: Optional[int] = int(filename_split[3])
except (IndexError, ValueError):
month = None
else:
index_remaining += 1
# Attempt to extract source(s) and resolution from filename stem
# e.g. "v6.0_CH4_2015_TOTALS.0.1x0.1.nc" --> "TOTALS.0.1x0.1"
# e.g. "v50_CH4_2015.0.1x0.1.nc" --> "2015.0.1x0.1" (note no source in filename)
# e.g. "v432_CH4_2010_9_IPCC_6A_6D.0.1x0.1.nc" --> "IPCC_6A_6D.0.1x0.1"
try:
source_resolution = "-".join(filename_split[index_remaining:])
except (IndexError, ValueError):
raise ValueError(f"Unable to extract source: {filename}")
else:
# e.g. "TOTALS.0.1x0.1" --> "TOTALS", "0.1x0.1"
# e.g. "2015.0.1x0.1" --> "2015", "0.1x0.1" --> "", "0.1x0.1"
# e.g. "IPCC_6A_6D.0.1x0.1" --> "IPCC-6A-6D", "0.1x0.1"
em_source = source_resolution.split(".")[0]
resolution = source_resolution.lstrip(em_source).lstrip(".")
# Check source was actually contained in filename and not just the year
# If so, set source to contain empty string
if em_source == str(year):
source += ""
else:
source += em_source
file_info = {
"version": version,
"species": species,
"year": year,
"source": source,
"resolution": resolution,
}
if month is not None:
file_info["month"] = month
return file_info
# def getedgarv5annualsectors(year, lon_out, lat_out, edgar_sectors, species='CH4'):
# """
# Get annual emission totals for species of interest from EDGAR v5.0 data
# for sector or sectors.
# Regrids to the desired lats and lons.
# CURRENTLY ONLY 2012 AND 2015 ANNUAL SECTORS IN SHARED DIRECTORY. OTHER YEARS NEED DOWNLOADING.
# Args:
# year (int):
# Year of interest
# lon_out (array):
# Longitudes to output the data on
# lat_out (array):
# Latitudes to output the data on
# edgar_sectors (list of str) (optional):
# EDGAR sectors to include. If list of values, the sum of these will be used.
# See below for list of possible sectors and full names.
# species (str):
# Which species you want to look at.
# e.g. species = 'CH4'
# Default = 'CH4'
# Currently only works for CH4.
# Returns:
# narr (array):
# Array of regridded emissions in mol/m2/s.
# Dimensions are [lat, lon]
# If there is no data for the species you are looking at you may have to
# download it from:
# https://edgar.jrc.ec.europa.eu/overview.php?v=50_GHG
# and place in:
# /data/shared/Gridded_fluxes/<species>/EDGAR_v5.0/yearly_sectoral/
# Note:
# EDGAR sector names:
# "AGS" = Agricultural soils
# "AWB" = Agricultural waste burning
# "CHE" = Chemical processes
# "ENE" = Power industry
# "ENF" = Enteric fermentation
# "FFF" = Fossil fuel fires
# "IND" = Combustion for manufacturing
# "IRO" = Iron and steel production
# "MNM" = Maure management
# "PRO_COAL" = Fuel exploitation - coal
# "PRO_GAS" = Fuel exploitation - gas
# "PRO_OIL" = Fuel expoitation - oil
# "PRO" = Fuel exploitation - contains coal, oil, gas
# "RCO" = Energy for buildings
# "REF_TRF" = Oil refineries and transformational industries
# "SWD_INC" = Solid waste disposal - incineration
# "SWD_LDF" = Solid waste disposal - landfill
# "TNR_Aviation_CDS" = Aviation - climbing and descent
# "TNR_Aviation_CRS" = Aviation - cruise
# "TNR_Aviation_LTO" = Aviation - landing and takeoff
# "TNR_Other" = Railways, pipelines and off-road transport
# "TNR_Ship" = Shipping
# "TRO" = Road transportation
# "WWT" = Waste water treatment
# """
# edgarfp = os.path.join(data_path,"Gridded_fluxes",species.upper(),"EDGAR_v5.0/yearly_sectoral")
# EDGARsectorlist = ["AGS","AWB","CHE","ENE","ENF","FFF","IND","IRO","MNM",
# "PRO_COAL","PRO_GAS","PRO_OIL","PRO","RCO","REF_TRF","SWD_INC",
# "SWD_LDF","TNR_Aviation_CDS","TNR_Aviation_CRS",
# "TNR_Aviation_LTO","TNR_Other","TNR_Ship","TRO","WWT"]
# if edgar_sectors is not None:
# print('Including EDGAR sectors.')
# for EDGARsector in edgar_sectors:
# if EDGARsector not in EDGARsectorlist:
# print('EDGAR sector {0} not one of: \n {1}'.format(EDGARsector,EDGARsectorlist))
# print('Returning None')
# return None
# #edgar flux in kg/m2/s
# for i,sector in enumerate(edgar_sectors):
# edgarfn = "v50_" + species.upper() + "_" + str(year) + "_" + sector + ".0.1x0.1.nc"
# with xr.open_dataset(os.path.join(edgarfp,edgarfn)) as edgar_file:
# edgar_flux = np.nan_to_num(edgar_file['emi_'+species.lower()].values,0.)
# edgar_lat = edgar_file.lat.values
# edgar_lon = edgar_file.lon.values
# if i == 0:
# edgar_total = edgar_flux
# else:
# edgar_total = np.add(edgar_total,edgar_flux)
# edgar_regrid_kg,arr = regrid2d(edgar_total,edgar_lat,edgar_lon,lat_out,lon_out)
# #edgar flux in mol/m2/s
# speciesmm = molar_mass(species)
# edgar_regrid = (edgar_regrid_kg.data*1e3) / speciesmm
# return(edgar_regrid)
# def getedgarv432annualsectors(year, lon_out, lat_out, edgar_sectors, species='CH4'):
# """
# Get annual emission totals for species of interest from EDGAR v4.3.2 data
# for sector or sectors.
# Regrids to the desired lats and lons.
# If there is no data for the species you are looking at you may have to
# download it from:
# http://edgar.jrc.ec.europa.eu/overview.php?v=432_GHG&SECURE=123
# and placed in:
# /data/shared/Gridded_fluxes/<species>/EDGAR_v4.3.2/<species>_sector_yearly/
# Args:
# year (int):
# Year of interest
# lon_out (array):
# Longitudes to output the data on
# lat_out (array):
# Latitudes to output the data on
# edgar_sectors (list):
# List of strings of EDGAR sectors to get emissions for.
# These will be combined to make one array.
# See 'Notes' for names of sectors
# species (str):
# Which species you want to look at.
# e.g. species = 'CH4'
# Default = 'CH4'
# Returns:
# narr (array):
# Array of regridded emissions in mol/m2/s.
# Dimensions are [lat, lon]
# Notes:
# Names of EDGAR sectors:
# 'powerindustry';
# 'oilrefineriesandtransformationindustry';
# 'combustionformanufacturing';
# 'aviationclimbinganddescent';
# 'aviationcruise';
# 'aviationlandingandtakeoff';
# 'aviationsupersonic';
# 'roadtransport';
# 'railwayspipelinesandoffroadtransport';
# 'shipping';
# 'energyforbuildings';
# 'fuelexploitation';
# 'nonmetallicmineralsproduction';
# 'chemicalprocesses';
# 'ironandsteelproduction';
# 'nonferrousmetalsproduction';
# 'nonenergyuseoffuels';
# 'solventsandproductsuse';
# 'entericfermentation';
# 'manuremanagement';
# 'agriculturalsoils';
# 'indirectN2Oemissionsfromagriculture';
# 'agriculturalwasteburning';
# 'solidwastelandfills';
# 'wastewaterhandling';
# 'Solid waste incineration';
# 'fossilfuelfires';
# 'indirectemissionsfromNOxandNH3';
# """
# species = species.upper() #Make sure species is uppercase
# #Path to EDGAR files
# edpath = os.path.join(data_path,'Gridded_fluxes/'+species+'/EDGAR_v4.3.2/'+species+'_sector_yearly/')
# #Dictionary of codes for sectors
# secdict = {'powerindustry' : '1A1a',
# 'oilrefineriesandtransformationindustry' : '1A1b_1A1c_1A5b1_1B1b_1B2a5_1B2a6_1B2b5_2C1b',
# 'combustionformanufacturing' : '1A2',
# 'aviationclimbinganddescent' : '1A3a_CDS',
# 'aviationcruise' : '1A3a_CRS',
# 'aviationlandingandtakeoff' : '1A3a_LTO',
# 'aviationsupersonic' : '1A3a_SPS',
# 'roadtransport' : '1A3b',
# 'railwayspipelinesandoffroadtransport' : '1A3c_1A3e',
# 'shipping' : '1A3d_1C2',
# 'energyforbuildings' : '1A4',
# 'fuelexploitation' : '1B1a_1B2a1_1B2a2_1B2a3_1B2a4_1B2c',
# 'nonmetallicmineralsproduction' : '2A',
# 'chemicalprocesses': '2B',
# 'ironandsteelproduction' : '2C1a_2C1c_2C1d_2C1e_2C1f_2C2',
# 'nonferrousmetalsproduction' : '2C3_2C4_2C5',
# 'nonenergyuseoffuels' : '2G',
# 'solventsandproductsuse' : '3',
# 'entericfermentation' : '4A',
# 'manuremanagement' : '4B',
# 'agriculturalsoils' : '4C_4D',
# 'indirectN2Oemissionsfromagriculture' : '4D3',
# 'agriculturalwasteburning' : '4F',
# 'solidwastelandfills' : '6A_6D',
# 'wastewaterhandling' : '6B',
# 'Solid waste incineration' : '6C',
# 'fossilfuelfires' : '7A',
# 'indirectemissionsfromNOxandNH3' : '7B_7C'
# }
# #Check to see range of years. If desired year falls outside of this range
# #then take closest year
# possyears = np.empty(shape=[0,0],dtype=int)
# for f in glob.glob(edpath+'v432_'+species+'_*'):
# fname = f.split('/')[-1]
# fyear = fname[9:13] #Extract year from filename
# possyears = np.append(possyears, int(fyear))
# if year > max(possyears):
# print("%s is later than max year in EDGAR database" % str(year))
# print("Using %s as the closest year" % str(max((possyears))))
# year = max(possyears)
# if year < min(possyears):
# print("%s is earlier than min year in EDGAR database" % str(year))
# print("Using %s as the closest year" % str(min((possyears))))
# year = min(possyears)
# #Species molar mass
# speciesmm = molar_mass(species)
# # if species == 'CH4':
# # #speciesmm = 16.0425
# # speciesmm = molar_mass(species)
# # elif species == 'N2O':
# # speciesmm = 44.013
# # else:
# # print "No molar mass for species %s." % species
# # print "Please add this and rerun the script"
# # print "Returning None"
# # return(None)
# #Read in EDGAR data of annual mean CH4 emissions for each sector
# #These are summed together
# #units are in kg/m2/s
# tot = None
# for sec in edgar_sectors:
# edgar = edpath+'v432_'+species+'_'+str(year)+'_IPCC_'+secdict[sec]+'.0.1x0.1.nc'
# if os.path.isfile(edgar):
# ds = xr.open_dataset(edgar)
# soiname = 'emi_'+species.lower()
# if tot is None:
# tot = ds[soiname].values*1e3 / speciesmm
# else:
# tot += ds[soiname].values*1e3 / speciesmm
# else:
# print('No annual file for sector %s and %s' % (sec, species))
# lat_in = ds.lat.values
# lon_in = ds.lon.values
# nlat = len(lat_out)
# nlon = len(lon_out)
# narr = np.zeros((nlat, nlon))
# narr, reg = regrid2d(tot, lat_in, lon_in,
# lat_out, lon_out)
# return(narr)
# def getedgarmonthlysectors(lon_out, lat_out, edgar_sectors, months=[1,2,3,4,5,6,7,8,9,10,11,12],
# species='CH4'):
# """
# Get 2010 monthly emissions for species of interest from EDGAR v4.3.2 data
# for sector or sectors.
# Regrids to the desired lats and lons.
# If there is no data for the species you are looking at you may have to
# download it from:
# http://edgar.jrc.ec.europa.eu/overview.php?v=432_GHG&SECURE=123
# and place it in:
# /data/shared/Gridded_fluxes/<species>/EDGAR_v4.3.2/<species>_sector_monthly/
# Args:
# lon_out (array):
# Longitudes to output the data on
# lat_out (array):
# Latitudes to output the data on
# edgar_sectors (list):
# List of strings of EDGAR sectors to get emissions for.
# These will be combined to make one array.
# See 'Notes' for names of sectors
# months (list of int; optional):
# Desired months.
# species (str, optional):
# Which species you want to look at.
# e.g. species = 'CH4'
# Default = 'CH4'
# Returns:
# narr (array):
# Array of regridded emissions in mol/m2/s.
# Dimensions are [no of months, lat, lon]
# Notes:
# Names of EDGAR sectors:
# 'powerindustry';
# 'oilrefineriesandtransformationindustry';
# 'combustionformanufacturing';
# 'aviationclimbinganddescent';
# 'aviationcruise';
# 'aviationlandingandtakeoff';
# 'aviationsupersonic';
# 'roadtransport';
# 'railwayspipelinesandoffroadtransport';
# 'shipping';
# 'energyforbuildings';
# 'fuelexploitation';
# 'nonmetallicmineralsproduction';
# 'chemicalprocesses';
# 'ironandsteelproduction';
# 'nonferrousmetalsproduction';
# 'nonenergyuseoffuels';
# 'solventsandproductsuse';
# 'entericfermentation';
# 'manuremanagement';
# 'agriculturalsoils';
# 'indirectN2Oemissionsfromagriculture';
# 'agriculturalwasteburning';
# 'solidwastelandfills';
# 'wastewaterhandling';
# 'Solid waste incineration';
# 'fossilfuelfires';
# 'indirectemissionsfromNOxandNH3';
# """
# species = species.upper() #Make sure species is uppercase
# #Path to EDGAR files
# edpath = os.path.join(data_path,'Gridded_fluxes/'+species+'/EDGAR_v4.3.2/'+species+'_sector_monthly/')
# #Dictionary of codes for sectors
# secdict = {'powerindustry' : '1A1a',
# 'oilrefineriesandtransformationindustry' : '1A1b_1A1c_1A5b1_1B1b_1B2a5_1B2a6_1B2b5_2C1b',
# 'combustionformanufacturing' : '1A2',
# 'aviationclimbinganddescent' : '1A3a_CDS',
# 'aviationcruise' : '1A3a_CRS',
# 'aviationlandingandtakeoff' : '1A3a_LTO',
# 'aviationsupersonic' : '1A3a_SPS',
# 'roadtransport' : '1A3b',
# 'railwayspipelinesandoffroadtransport' : '1A3c_1A3e',
# 'shipping' : '1A3d_1C2',
# 'energyforbuildings' : '1A4',
# 'fuelexploitation' : '1B1a_1B2a1_1B2a2_1B2a3_1B2a4_1B2c',
# 'nonmetallicmineralsproduction' : '2A',
# 'chemicalprocesses': '2B',
# 'ironandsteelproduction' : '2C1a_2C1c_2C1d_2C1e_2C1f_2C2',
# 'nonferrousmetalsproduction' : '2C3_2C4_2C5',
# 'nonenergyuseoffuels' : '2G',
# 'solventsandproductsuse' : '3',
# 'entericfermentation' : '4A',
# 'manuremanagement' : '4B',
# 'agriculturalsoils' : '4C_4D',
# 'indirectN2Oemissionsfromagriculture' : '4D3',
# 'agriculturalwasteburning' : '4F',
# 'solidwastelandfills' : '6A_6D',
# 'wastewaterhandling' : '6B',
# 'Solid waste incineration' : '6C',
# 'fossilfuelfires' : '7A',
# 'indirectemissionsfromNOxandNH3' : '7B_7C'
# }
# print('Note that the only year for monthly emissions is 2010 so using that.')
# #Species molar mass
# speciesmm = molar_mass(species)
# # if species == 'CH4':
# # speciesmm = 16.0425
# # elif species == 'N2O':
# # speciesmm = 44.013
# # else:
# # print "No molar mass for species %s." % species
# # print "Please add this and rerun the script"
# # print "Returning None"
# # return(None)
# #Read in EDGAR data of annual mean CH4 emissions for each sector
# #These are summed together
# #units are in kg/m2/s
# warnings = []
# first = 0
# for month in months:
# tot = np.array(None)
# for sec in edgar_sectors:
# edgar = edpath+'v432_'+species+'_2010_'+str(month)+'_IPCC_'+secdict[sec]+'.0.1x0.1.nc'
# if os.path.isfile(edgar):
# ds = xr.open_dataset(edgar)
# soiname = 'emi_'+species.lower()
# if tot.any() == None:
# tot = ds[soiname].values*1e3 / speciesmm
# else:
# tot += ds[soiname].values*1e3 / speciesmm
# else:
# warnings.append('No monthly file for sector %s' % sec)
# #print 'No monthly file for sector %s' % sec
# if first == 0:
# emissions = np.zeros((len(months), tot.shape[0], tot.shape[1]))
# emissions[0,:,:] = tot
# else:
# first += 1
# emissions[first,:,:] = tot
# for warning in np.unique(warnings):
# print(warning)
# lat_in = ds.lat.values
# lon_in = ds.lon.values
# nlat = len(lat_out)
# nlon = len(lon_out)
# narr = np.zeros((nlat, nlon, len(months)))
# for i in range(len(months)):
# narr[:,:,i], reg = regrid2d(emissions[i,:,:], lat_in, lon_in,
# lat_out, lon_out)
# return(narr)