"""Load and check against controlled vocabularies.
This module provides a class to load controlled vocabularies, either from JSON files or
from URLs. It also provides a method to check a value against a controlled vocabulary.
"""
import os
import re
import json
import requests
from typing import Dict, List, Union, Any
import time
from .config import get_config
# Package configuration; only its "settings" section is read in this module.
conf = get_config()
# Directory holding the local "<vocab_id>.json" vocabulary files.
vocabs_dir: str = conf["settings"]["vocabs_dir"]
# Optional prefix of a lookup string; a leading "<prefix>:" is stripped before
# the lookup string is split into keys.
vocabs_prefix: str = conf["settings"]["vocabs_prefix"]
# Lookup keys that select every entry at the current level of a vocabulary.
WILDCARD = ["__all__"]
class Vocabs:
    """Load and check against controlled vocabularies.

    This class provides methods to load controlled vocabularies from JSON files or
    from URLs on GitHub or the CEDA Vocab Service. It also provides a method to check
    a value against a value or list of values from those controlled vocabularies.

    Vocabularies are loaded lazily on first access and cached, so each vocabulary
    file or URL is read at most once per instance.

    Attributes:
        _vocabs: A dictionary of controlled vocabularies, where the keys are the
            vocabulary IDs and the values are the vocabularies themselves.
    """

    def __init__(self) -> None:
        """Initialise the Vocabs class with an empty vocabulary cache."""
        self._vocabs: Dict[str, Any] = {}

    def _load(self, vocab_id: str) -> None:
        """Load a specific vocabulary file based on the vocab_id.

        Loads vocabulary file from the vocabs directory and stores it in the _vocabs
        dictionary.

        Args:
            vocab_id: The name of the vocabulary to load, without the ".json" ending.
        """
        vocab_file = os.path.join(vocabs_dir, f"{vocab_id}.json")
        # Context manager so the file handle is closed even if parsing fails.
        with open(vocab_file, encoding="utf-8") as vocab_fh:
            self._vocabs[vocab_id] = json.load(vocab_fh)

    def _load_from_url_github(self, vocab_id_url: str) -> Dict[str, Any]:
        """Load a specific vocabulary from a GitHub URL.

        If the URL contains "/__latest__/", that placeholder is replaced with the
        last path segment of the URL that the repository's "releases/latest" page
        resolves to, before the vocabulary itself is requested.

        Args:
            vocab_id_url: The URL of the vocabulary to load.

        Returns:
            The JSON loaded vocabulary, or an empty dictionary if the request did
            not return HTTP 200.
        """
        if "/__latest__/" in vocab_id_url:
            # Everything before the placeholder is the repository; switch from the
            # raw-content host to the main site to query its releases.
            repo_url = vocab_id_url.split("/__latest__")[0].replace(
                "raw.githubusercontent.com", "github.com"
            )
            latest_version = self._get_url(f"{repo_url}/releases/latest").url.split(
                "/"
            )[-1]
            vocab_id_url = vocab_id_url.replace("__latest__", latest_version)

        res = self._get_url(vocab_id_url.replace("__URL__", "https://"))
        if res.status_code != 200:
            print(f"[WARNING] Failed to load vocab: {vocab_id_url}")
            return {}
        return res.json()

    def _load_from_url_esacci(self, vocab_id_url: str) -> List[str]:
        """Load a specific vocabulary for ESA CCI.

        Loads vocabulary file for the European Space Agency Climate Change Initiative
        format from the CEDA Vocab Server and return values in that vocabulary.
        URLs containing "dataType" yield the "#altLabel" values, URLs containing
        "product" yield the "#prefLabel" values.

        Args:
            vocab_id_url: The URL of the vocabulary to load.

        Returns:
            Sorted list of values from the vocabulary; empty if the request failed
            or the URL is not a recognised ESA CCI vocabulary.
        """
        # Use the shared helper so this loader gets the same retry handling as the
        # GitHub loader.
        res = self._get_url(vocab_id_url)
        if res.status_code != 200:
            print(f"[WARNING] Failed to load vocab: {vocab_id_url}")
            return []

        if "dataType" in vocab_id_url:
            label_suffix = "#altLabel"
        elif "product" in vocab_id_url:
            label_suffix = "#prefLabel"
        else:
            print(f"[WARNING] ESA CCI vocab url not recognised: {vocab_id_url}")
            return []

        return sorted(
            labels[0]["@value"]
            for entry in res.json()
            for key, labels in entry.items()
            if key.endswith(label_suffix)
        )

    def _load_from_url(self, vocab_id: str) -> None:
        """Load specific vocabulary from URL.

        Loads a controlled vocabulary from either GitHub or the CEDA Vocab Server and
        saves it in the class' _vocabs attribute. Vocabulary should start with
        "__URL__" instead of "https://". If the URL belongs to neither service a
        message is printed and an empty vocabulary is stored.

        Args:
            vocab_id: URL of vocabulary to load.
        """
        vocab_id_url = vocab_id.replace("__URL__", "https://")
        # Default for unrecognised URLs, mirroring the empty result the individual
        # loaders return on failure (previously this path left the name unbound).
        vocab_list: Union[Dict[str, Any], List[str]] = []
        if vocab_id_url.startswith("https://raw.githubusercontent.com"):
            vocab_list = self._load_from_url_github(vocab_id_url)
        elif vocab_id_url.startswith("https://vocab.ceda.ac.uk"):
            vocab_list = self._load_from_url_esacci(vocab_id_url)
        else:
            print(f"Vocabulary url provided is not recognised: {vocab_id_url}")
        self._vocabs[vocab_id] = vocab_list

    def _get_url(
        self,
        url: str,
        retries: int = 5,
        wait: float = 10,
        timeout: float = 60,
    ) -> "requests.Response":
        """GET a URL, retrying on timeout or HTTP 429 error.

        Args:
            url: URL to GET.
            retries: Maximum number of retries after the first attempt.
            wait: Seconds to sleep before each retry.
            timeout: Timeout in seconds passed to the GET request.

        Returns:
            Response from the GET request. If every attempt returned HTTP 429, the
            last such response is returned so the caller can inspect its status.

        Raises:
            requests.exceptions.Timeout: If the final attempt also timed out.
        """
        attempt = 0
        while True:
            try:
                res = requests.get(url, timeout=timeout)
            except (requests.exceptions.Timeout, TimeoutError):
                # requests signals timeouts with its own exception class; the
                # builtin is kept for compatibility with the previous handler.
                if attempt >= retries:
                    raise
            else:
                if res.status_code != 429 or attempt >= retries:
                    return res
            attempt += 1
            time.sleep(wait)

    def __getitem__(self, vocab_id: str) -> Union[Dict[str, Any], List[str]]:
        """Enable dictionary access to individual vocabulary items.

        Access vocabularies as keys of the class. Loads vocabulary if not already
        loaded, and returns the vocabulary.

        Args:
            vocab_id: Vocabulary to get. IDs starting with "__URL__" are fetched
                from a URL, all others are read from the vocabs directory.

        Returns:
            Vocabulary as dictionary or vocabulary items as list.
        """
        if vocab_id not in self._vocabs:
            if vocab_id.startswith("__URL__"):
                self._load_from_url(vocab_id)
            else:
                self._load(vocab_id)
        return self._vocabs[vocab_id]

    def lookup(
        self,
        vocab_lookup: str,
    ) -> Union[Dict[str, Any], List[str], str, int, float]:
        """Nested dictionary-style look-up for value(s) in a vocabulary.

        Iterates through a vocabulary to find the value(s) that are required for the
        check. The string "__all__" can be used once within the vocab_lookup. If
        "__all__" is the last key in the lookup, this will return a list of all the
        keys at that stage in the vocabulary file. If it comes before, e.g.
        "__all__:type", then it will return the value of "type" from every dictionary
        at the "__all__" level in the vocabulary.

        Args:
            vocab_lookup: String that states which vocabulary to use and what value(s)
                within the vocabulary to use. Should be of format
                "path/to/vocab_id:keys:in:vocab:file", optionally preceded by the
                configured vocabs prefix and a colon.

        Returns:
            Value, list of values, or dictionary of data from vocabulary.

        Raises:
            ValueError: If a second wildcard is used, or if a key is applied to
                something that is neither a dictionary nor a list.
        """
        # Plain string comparison rather than a regex, so characters in the
        # configured prefix are never interpreted as a pattern.
        prefix = f"{vocabs_prefix}:"
        if vocab_lookup.startswith(prefix):
            vocab_lookup = vocab_lookup[len(prefix):]

        keys = vocab_lookup.split(":")
        last_index = len(keys) - 1
        obj: Any = self
        for i, key in enumerate(keys):
            if i == 0:
                # First element names the vocabulary; loads it on first use.
                obj = self[key]
            elif isinstance(obj, dict):
                if key not in WILDCARD:
                    obj = obj[key]
                elif i != last_index:
                    # Wildcard followed by more keys: descend into every entry.
                    obj = list(obj.values())
                else:
                    # Wildcard used as last option, just get keys.
                    obj = list(obj.keys())
            elif not isinstance(obj, list):
                # Sanity check: a key was applied to a scalar.
                raise ValueError(f"Confused how we got here, obj = {obj}")
            elif key in WILDCARD:
                raise ValueError(
                    f"Second WILDCARD ({WILDCARD}) in query {vocab_lookup} not allowed"
                )
            else:
                # obj should be list of dicts, creating list of values or dicts.
                obj = [d[key] for d in obj]
        return obj

    def check(
        self,
        vocab_lookup: Union[str, List[Union[str, int, float]]],
        value: Any,
        label: str = "",
        lookup: bool = True,
        spec_verb: bool = False,
    ) -> List[str]:
        """Checks value or values against value or values in vocabulary.

        Checks whether a given value (or values) matches the value or values at a
        given location in a controlled vocabulary. Controlled vocabulary is a JSON
        file either within the vocabs directory, or at a given URL. For vocabulary
        files in the vocab directory, the vocab_lookup should start
        "__vocabs__:path/to/file:..." (NOTE without the ".json" extension), and
        vocabularies accessed by a URL should start
        "__URL__www.website.com/vocab_file.json:..." (NOTE "__URL__" takes the place
        of "https://" and is followed directly by the host, and the ".json"
        extension is kept). Each key within the vocabulary file that leads to the
        value(s) required should follow and be separated by a colon.

        vocab_lookup could also be a list of values to check directly against - in
        this case, set `lookup` to False.

        How the comparison is made depends on what the vocabulary resolves to: a
        list means `value` must be one of its items; a dictionary means `value` must
        have every key of that dictionary, and each entry is checked recursively;
        anything else must equal `value`.

        Args:
            vocab_lookup: Vocabulary to use and path to value(s) in vocabulary.
            value: Value(s) to check against vocabulary.
            label: Text to prepend to error messages.
            lookup: Find vocabulary from file or URL (True, default), or use
                `vocab_lookup` as the vocabulary to use (False).
            spec_verb: Print information about vocab check.

        Returns:
            List of messages where value(s) can not be found in vocabulary.
        """
        errors: List[str] = []
        options = self.lookup(vocab_lookup) if lookup else vocab_lookup
        if spec_verb:
            print(f"Vocab lookup: {vocab_lookup}")

        if isinstance(options, list):
            if value not in options:
                errors.append(
                    f"{label} '{value}' not in vocab options: {options} "
                    f"(using: '{vocab_lookup}')"
                )
            elif spec_verb:
                print(f"Value: {value} is in list {options}")
        elif isinstance(options, dict):
            if not hasattr(value, "keys"):
                # Report a mismatch instead of failing with AttributeError when
                # the vocabulary expects attributes but value is not mapping-like.
                errors.append(
                    f"{label} '{value}' is not a dictionary, expected attributes: "
                    f"{list(options)}"
                )
                return errors
            for key, expected in options.items():
                if key in value.keys():
                    errors.extend(
                        self.check(
                            expected,
                            value[key],
                            label=f"{label}:{key}",
                            lookup=False,
                        )
                    )
                else:
                    errors.append(f"{label} does not have attribute '{key}'")
        elif value != options:
            errors.append(
                f"{label} '{value}' does not equal required vocab value: "
                f"'{options}' (using: '{vocab_lookup}')"
            )
        return errors
# Module-level instance created at import time; vocabularies accessed through it
# are cached in its _vocabs dictionary after the first load.
vocabs = Vocabs()