Source code for pytolnet

__all__ = ['TOLNetAPI']
__doc__ = """TOLNet API
==========

Utilities for retrieving and plotting TOLNet data.

To Install
----------

.. code-block:: bash

    python -m pip install --user git+https://github.com/barronh/pytolnet.git

Example
-------

.. code-block:: python

    import pytolnet
    api = pytolnet.TOLNetAPI()
    cldf = api.data_calendar('UAH')
    newest_data_id = cldf.index.values[0]
    ds = api.to_dataset(newest_data_id)
    print(ds.to_dataframe().reset_index().describe())
    #                                 time      altitude  derived_ozone
    # count                         174096  174096.00000   63538.000000
    # mean   2023-08-16 19:17:36.874643968       7.72500      48.525455
    # min              2023-08-16 13:06:59       0.30000       0.015444
    # 25%              2023-08-16 16:10:39       4.01250      40.799999
    # 50%              2023-08-16 19:18:37       7.72500      47.500000
    # 75%              2023-08-16 22:24:22      11.43750      55.299999
    # max              2023-08-17 01:31:57      15.15000     100.000000
    # std                              NaN       4.29549      13.209246
"""
__version__ = '0.1.1'

changelog = """
v0.1.0 : First release. Includes fix for boolean properties.
"""


class TOLNetAPI:
    def __init__(
        self, token='anonymous', cache='.',
        root='https://tolnet.larc.nasa.gov/api'
    ):
        """
        Arguments
        ---------
        token : str
            Token for API if using non-anonymous access.
        cache : str
            Folder in which downloaded files are stored.
        root : str
            Path to TOLNet API

        Returns
        -------
        api : TOLNetAPI
            Object for accessing TOLNetAPI

        Example
        -------
        .. code-block:: python

            import pytolnet
            api = pytolnet.TOLNetAPI()
            cldf = api.data_calendar('UAH')
            newest_data_id = cldf.index.values[0]
            ds = api.to_dataset(newest_data_id)
            print(ds.to_dataframe().reset_index().describe())
            #                                 time      altitude  derived_ozone
            # count                         174096  174096.00000   63538.000000
            # mean   2023-08-16 19:17:36.874643968       7.72500      48.525455
            # min              2023-08-16 13:06:59       0.30000       0.015444
            # 25%              2023-08-16 16:10:39       4.01250      40.799999
            # 50%              2023-08-16 19:18:37       7.72500      47.500000
            # 75%              2023-08-16 22:24:22      11.43750      55.299999
            # max              2023-08-17 01:31:57      15.15000     100.000000
            # std                              NaN       4.29549      13.209246
        """
        import requests
        self._session = requests.Session()
        self._root = root
        self.set_token(token)
        self._instrument_groups_df = None
        self._file_types_df = None
        self._product_types_df = None
        self._processing_types_df = None
        self._cache = cache

    def set_token(self, token=None):
        """
        Arguments
        ---------
        token : str
            Token to use for access. Use 'anonymous' if you don't have one.
            Use None if you want to be prompted.
        """
        import getpass
        if token is None:
            prompt = (
                'Enter token for authorized access or anonymous if you do not'
                + ' have a token\nEnter token:'
            )
            token = getpass.getpass(prompt)
        self._token = token
        self._headers = {"Authorization": f"Bearer {token}"}
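
    # Usage sketch (kept as a comment so import has no side effects):
    # passing token=None triggers an interactive getpass prompt, while any
    # string is used directly as the bearer token.
    #
    #     api = pytolnet.TOLNetAPI()
    #     api.set_token(None)          # prompts for a token via getpass
    #     api.set_token('anonymous')   # switch back to anonymous access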

    def _get_meta(self, key):
        """
        Simple wrapper to open and return a dataframe
        """
        import pandas as pd
        root = self._root
        headers = self._headers
        s = self._session
        r = s.get(f'{root}/{key}', headers=headers)
        j = r.json()
        if 'status' in j and 'message' in j and 'id' not in j:
            raise IOError('Status {status}: {message}'.format(**j))
        df = pd.DataFrame.from_records(j, index='id').sort_index()
        return df
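
    # For reference, a call like _get_meta('instruments/groups') is roughly
    # equivalent to this standalone requests/pandas sketch (anonymous token
    # shown; endpoint keys come from the public methods below):
    #
    #     import requests
    #     import pandas as pd
    #     r = requests.get(
    #         'https://tolnet.larc.nasa.gov/api/instruments/groups',
    #         headers={'Authorization': 'Bearer anonymous'},
    #     )
    #     df = pd.DataFrame.from_records(r.json(), index='id').sort_index()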

    def instruments_groups(self):
        """
        Returns
        -------
        igdf : pandas.DataFrame
            Instrument groups dataframe
        """
        if self._instrument_groups_df is None:
            self._instrument_groups_df = self._get_meta('instruments/groups')
        return self._instrument_groups_df

    def product_types(self):
        """
        Returns
        -------
        prdf : pandas.DataFrame
            Product types dataframe
        """
        if self._product_types_df is None:
            self._product_types_df = self._get_meta('data/product_types')
        return self._product_types_df

    def file_types(self):
        """
        Returns
        -------
        fldf : pandas.DataFrame
            File types dataframe
        """
        if self._file_types_df is None:
            self._file_types_df = self._get_meta('data/file_types')
        return self._file_types_df

    def processing_types(self):
        """
        Returns
        -------
        ptdf : pandas.DataFrame
            Processing types dataframe
        """
        if self._processing_types_df is None:
            self._processing_types_df = self._get_meta('data/processing_types')
        return self._processing_types_df
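
    # Sketch of browsing the four metadata tables; each result is cached on
    # the instance, so repeated calls do not re-query the API. The id/name
    # pairings noted here are the ones documented in data_calendar below.
    #
    #     api = pytolnet.TOLNetAPI()
    #     print(api.instruments_groups())  # find the igid for, e.g., 'UAH'
    #     print(api.product_types())       # e.g., 4=HIRES, 5=CALVAL, 6=CLIM
    #     print(api.processing_types())    # e.g., 1=central, 2=inhouse
    #     print(api.file_types())          # e.g., 1=HDF GEOMS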

    def data_calendar(
        self, igname=None, igid=None, product_type='4',
        processing_type='1,2', file_type='1', ascending=False
    ):
        """
        Retrieve a data calendar.

        Arguments
        ---------
        igname : str or None
            Instruments Group name (see instruments_groups)
        igid : int or None
            Instruments Group id (see instruments_groups); supersedes
            igname. If igname and igid are both None, returns a calendar
            from all instruments.
        product_type : int or str
            Defaults to 4 (HIRES), which is the supported data to be read.
            Other formats (5=CALVAL; 6=CLIM) are not tested. Remaining
            formats (7=gridded; 8=legacy) are not likely to work.
        processing_type : int or str
            Defaults to '1,2' (central, inhouse). Unprocessed (3) is not
            yet supported.
        file_type : int or str
            Defaults to '1' (HDF GEOMS). See file_types for other options.
        ascending : bool
            Sort order for start_data_date; defaults to False (newest
            first).

        Returns
        -------
        caldf : pandas.DataFrame
            DataFrame of data by date

        Example
        -------
        .. code-block:: python

            import pytolnet
            api = pytolnet.TOLNetAPI()
            cldf = api.data_calendar('UAH')
            print(cldf.columns)
            # 'start_data_date', 'public', 'near_real_time', 'isAccessible'
        """
        from warnings import warn
        import pandas as pd

        if igid is None:
            igdf = self.instruments_groups()
            if igname is None:
                cldfs = []
                opts = dict(
                    product_type=product_type,
                    processing_type=processing_type,
                    file_type=file_type, ascending=ascending
                )
                for igid, row in igdf.iterrows():
                    try:
                        cldfs.append(self.data_calendar(igid=igid, **opts))
                    except Exception as e:
                        instrument_group_name = row['instrument_group_name']
                        msg = f'igid={igid} failed ({instrument_group_name})'
                        msg += f'; {e}'
                        warn(msg)
                return pd.concat(cldfs)
            else:
                sigdf = igdf.query(f'instrument_group_name == "{igname}"')
                if sigdf.shape[0] == 0:
                    ignames = igdf['instrument_group_name'].unique()
                    raise KeyError(f'igname not in {ignames}; got {igname}')
                igids = sigdf.index.values
                igid = igids[0]
                if sigdf.shape[0] > 1:
                    warn(f'igname is not unique {igids}; defaulting to {igid}')

        cldf = self._get_meta(
            f'data/calendar?instrument_group={igid}'
            + f'&product_type={product_type}&processing_type={processing_type}'
            + f'&file_type={file_type}'
        )
        return cldf.sort_values('start_data_date', ascending=ascending)
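
    # Sketch: with igname and igid both omitted, data_calendar loops over
    # every instrument group, warns on groups that fail, and concatenates
    # the per-group results.
    #
    #     api = pytolnet.TOLNetAPI()
    #     cldf = api.data_calendar()   # all instruments, newest first
    #     print(cldf[['start_data_date', 'isAccessible']].head())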

    def to_dataset(self, id, cache=None, overwrite=False, product_type=4):
        """
        Acquire data from product_type and return it as an xarray.Dataset

        Arguments
        ---------
        id : int
            Must come from data with the requested product_type
        cache : str
            Path to keep cached files
        overwrite : bool
            If False (default), use cached files in the cache folder.
            If True, remake all files.
        product_type : int
            Currently supports 4, 5, and 6 (all share the same format)

        Returns
        -------
        ds : xarray.Dataset
            Dataset for file requested

        Example
        -------
        .. code-block:: python

            import pytolnet
            api = pytolnet.TOLNetAPI()
            ds = api.to_dataset(2115)
        """
        opts = dict(id=id, cache=cache, overwrite=overwrite)
        if product_type == 4:
            ds = self.get_product_type4(**opts)
        elif product_type == 5:
            ds = self.get_product_type5(**opts)
        elif product_type == 6:
            ds = self.get_product_type6(**opts)
        else:
            raise IOError(
                f'Only supports product_type in (4, 5, 6); got {product_type}'
            )
        return ds
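
    # Sketch of the caching behavior: the first call writes {cache}/{id}.nc
    # and later calls reopen that netCDF file unless overwrite=True.
    #
    #     api = pytolnet.TOLNetAPI(cache='.')
    #     ds1 = api.to_dataset(2115)                  # downloads ./2115.nc
    #     ds2 = api.to_dataset(2115)                  # reuses ./2115.nc
    #     ds3 = api.to_dataset(2115, overwrite=True)  # forces a re-download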

    def get_product_type6(self, id, cache=None, overwrite=False):
        """
        Product type 6 has the same format as 4, so this is a thin wrapper.

        Same as to_dataset(..., product_type=6)
        """
        opts = dict(id=id, cache=cache, overwrite=overwrite)
        return self.get_product_type4(**opts)

    def get_product_type5(self, id, cache=None, overwrite=False):
        """
        Product type 5 has the same format as 4, so this is a thin wrapper.

        Same as to_dataset(..., product_type=5)
        """
        opts = dict(id=id, cache=cache, overwrite=overwrite)
        return self.get_product_type4(**opts)

    def get_product_type4(self, id, cache=None, overwrite=False):
        """
        Acquire data from product_type=4 and return it as an xarray.Dataset

        Same as to_dataset(..., product_type=4)

        Arguments
        ---------
        id : int
            Must come from data with product_type=4
        cache : str
            Path to keep cached files
        overwrite : bool
            If False (default), use cached files in the cache folder.
            If True, remake all files.

        Returns
        -------
        ds : xarray.Dataset
            Dataset for file requested
        """
        import numpy as np
        import pandas as pd
        import xarray as xr
        import os

        root = self._root
        headers = self._headers
        s = self._session
        if cache is None:
            cache = self._cache
        outpath = f'{cache}/{id}.nc'
        if not os.path.exists(outpath) or overwrite:
            r = s.get(f'{root}/data/json/{id}', headers=headers)
            j = r.json()
            # Altitude coordinate: drop the 4-character prefix from each
            # attribute key and lowercase the remainder
            altattrs = j['altitude']['attributes']
            altattrs = {k[4:].lower(): v for k, v in altattrs.items()}
            altdata = np.array(j['altitude']['data'])
            alt = xr.DataArray(
                altdata, name='altitude', dims=('altitude',), attrs=altattrs
            )
            # Time coordinate: store as seconds since the Unix epoch so
            # xarray can decode it on reopen
            timeattrs = j['datetime']['attributes']
            timeattrs = {k[4:].lower(): v for k, v in timeattrs.items()}
            timeattrs['units'] = 'seconds since 1970-01-01 00:00:00+0000'
            timedata = (
                pd.to_datetime(j['datetime']['data'])
                - pd.to_datetime('1970-01-01 00:00:00+0000')
            ).total_seconds()
            time = xr.DataArray(
                timedata, name='time', dims=('time',), attrs=timeattrs
            )
            # Ozone values: mask fill values and anything outside the
            # valid_min/valid_max range declared by the file
            varattrs = j['value']['attributes']
            varattrs = {k[4:].lower(): v for k, v in varattrs.items()}
            vardata = np.array(j['value']['data']).astype('f')
            vardata = np.ma.masked_values(vardata, varattrs['fill_value'])
            vardata = np.ma.masked_greater(vardata, varattrs['valid_max'])
            vardata = np.ma.masked_less(vardata, varattrs['valid_min'])
            var = xr.DataArray(
                vardata, dims=('time', 'altitude'), name='derived_ozone',
                coords={'time': time, 'altitude': alt}, attrs=varattrs
            )
            # File-level attributes; booleans in fileInfo are converted to
            # int because netCDF attributes cannot store bools
            fattrs = {k: v for k, v in j['attributes'].items()}
            fattrs.update({
                k: v for k, v in j.items()
                if k not in (
                    'altitude', 'datetime', 'value', 'attributes', 'fileInfo'
                )
            })
            fileinfo = {
                k: v if not isinstance(v, bool) else int(v)
                for k, v in j['fileInfo'].items()
            }
            fattrs.update(fileinfo)
            vds = xr.Dataset(data_vars={'derived_ozone': var}, attrs=fattrs)
            vds.to_netcdf(outpath)

        ds = xr.open_dataset(outpath)
        return ds
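

if __name__ == '__main__':
    # Minimal plotting demo, hedged: assumes network access and that
    # matplotlib is installed (it is not imported by this module). Data id
    # 2115 is the example id from the to_dataset docstring.
    api = TOLNetAPI()
    ds = api.to_dataset(2115)
    # A 2D DataArray plots as a time/altitude "curtain" pcolormesh
    qm = ds['derived_ozone'].plot(x='time', y='altitude')
    qm.figure.savefig('tolnet_2115.png')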