Source code for pyrsig.xdr

__all__ = ['from_xdrfile', 'from_xdr']


def getgridprops(gdnam, crshdr, crsparts, projparts, gridparts, vgparts):
    import numpy as np
    vgprops = {}
    vgprops['VGTYP'] = np.int32(vgparts[0])
    vgprops['VGTOP'] = np.float32(vgparts[1])
    vgprops['VGLVLS'] = np.array(vgparts[2:], dtype='f')
    gridkeys = ['NCOLS', 'NROWS', 'XORIG', 'YORIG', 'XCELL', 'YCELL']
    gridprops = dict(zip(gridkeys, gridparts))
    projattrs = {'GDNAM': gdnam}
    projattrs.update(gridprops)
    crskeys = crshdr.split(': ')[-1].split()
    crsprops = dict(zip(crskeys, crsparts))
    if 'stereographic' in crshdr:
        projattrs['GDTYP'] = 6
        projattrs['P_ALP'] = crsprops['lat_0'] / 90
        projattrs['XCENT'] = projattrs['P_BET'] = crsprops['lon_0']
        projattrs['YCENT'] = crsprops['lat_0']
        projattrs['P_GAM'] = crsprops['lat_sec']
    elif 'lcc' in crshdr:
        projattrs['GDTYP'] = 2
        projattrs['P_ALP'] = crsprops['lat_1']
        projattrs['P_BET'] = crsprops['lat_2']
        projattrs['P_GAM'] = crsprops['lon_0']
        projattrs['XCENT'] = crsprops['lon_0']
        projattrs['YCENT'] = crsprops['lat_0']
    elif 'lonlat' in crshdr:
        projattrs['GDTYP'] = 1
    else:
        raise KeyError(f'Need to implement {crshdr}')
        # projattrs['GDTYP'] = 7 # merc
        # projattrs['GDTYP'] = 1 # lonlat
    return {'vertical': vgprops, 'projection': projattrs, 'crs': crsprops}
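

# Illustrative sketch (not part of pyrsig): how getgridprops assembles
# IOAPI-like metadata from parsed header pieces. All header values below are
# hypothetical; a header containing 'lcc' maps to GDTYP 2 (Lambert conformal
# conic), per the branch above.
def _example_getgridprops():
    crshdr = (
        '# lcc projection: lat_1 lat_2 lat_0 lon_0'
        ' major_semiaxis minor_semiaxis'
    )
    crsparts = [33., 45., 40., -97., 6.37e6, 6.37e6]
    gridparts = [459., 299., -2556000., -1728000., 12000., 12000.]
    vgparts = [7, 5000., 1., 0.9975, 0.995]
    projparts = list(gridparts) + list(vgparts)
    props = getgridprops(
        '12US1', crshdr, crsparts, projparts, gridparts, vgparts
    )
    assert props['projection']['GDTYP'] == 2  # Lambert conformal conic
    return props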


def from_xdrfile(
    path, na_values=None, decompress=None, as_dataframe=True,
    decompress_inline=True
):
    """
    Currently supports profile, site and swath (v2.0). Each is in XDR format
    with a custom set of header rows in text format. The text header rows
    also describe the binary portion of the file. See from_xdr for the full
    list of supported formats.

    Arguments
    ---------
    path : str
        Path to file in XDR format with RSIG headers
    decompress : bool
        If None, decompress when path ends in .gz.
        If True, decompress.
        If False, do not decompress (was never compressed).
    decompress_inline : bool
        If True (default), use gzip.open to decompress and read the file.
        If False, decompress the file on disk.
    as_dataframe : bool
        If True (default), return data as a pandas.DataFrame.
        If False, return an xarray.Dataset. Only subset and grid support
        as_dataframe.

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content
    """
    import gzip
    import os
    import pandas as pd

    if decompress is None:
        decompress = path.endswith('.gz')
    fsize = os.stat(path).st_size
    if fsize == 20:
        return pd.DataFrame()
    elif fsize < 20:
        raise IOError(
            'File size less than 20-bytes; likely failed to download.'
        )
    if decompress:
        if decompress_inline:
            inf = gzip.open(path)
        else:
            temppath = path.replace('.gz', '')
            assert temppath != path
            if not os.path.exists(temppath):
                with gzip.open(path, 'rb') as inf:
                    with open(temppath, 'wb') as outf:
                        outf.write(inf.read())
            inf = open(temppath, 'rb')
    else:
        inf = open(path, 'rb')
    outf = from_xdr(
        inf, decompress=decompress, na_values=na_values,
        as_dataframe=as_dataframe
    )
    inf.close()
    return outf
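

# Usage sketch (not part of pyrsig): the path and missing-value flag below
# are hypothetical. With a .gz path and the defaults, decompression is
# inferred from the suffix and done in memory via gzip.open;
# decompress_inline=False would instead write a decompressed copy to disk.
def _example_from_xdrfile():
    df = from_xdrfile('airnow_pm25.xdr.gz', na_values=-9999.0)
    return df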


def from_xdr(inf, na_values=None, decompress=False, as_dataframe=True):
    """
    Currently supports profile, site and swath (v2.0). Each is in XDR format
    with a custom set of header rows in text format. The text header rows
    also describe the binary portion of the file.

    Infers the RSIG format using the first 40 characters:

    * Site 2.0: from_site
    * Profile 2.0: from_profile
    * Swath 2.0: from_swath
    * Point 1.0: from_point
    * CALIPSO 1.0: from_calipso
    * Polygon 1.0: from_polygon
    * Grid 1.0: from_grid
    * Subset 9.0: from_subset
    * REGRIDDED-Swath 2.0: from_regriddedswath

    Arguments
    ---------
    inf : file
        Data file in XDR format with RSIG headers
    na_values : scalar
        Used to remove known missing values.
    decompress : bool
        If True, decompress to temporary file.
        If False, do not decompress to temporary file (was never compressed).
    as_dataframe : bool
        If True (default), return data as a pandas.DataFrame.
        If False, return an xarray.Dataset. Only subset and grid support
        as_dataframe.

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content
    """
    import numpy as np

    inf.seek(0, 0)
    defspec = inf.read(40).decode().split('\n')[0].lower().strip()
    inf.seek(0, 0)
    if defspec.startswith('profile'):
        df = from_profile(inf)
    elif defspec.startswith('site'):
        df = from_site(inf)
    elif defspec.startswith('point'):
        df = from_point(inf)
    elif defspec.startswith('swath'):
        df = from_swath(inf)
    elif defspec.startswith('calipso'):
        df = from_calipso(inf)
    elif defspec.startswith('polygon'):
        df = from_polygon(inf)
    elif defspec.startswith('grid'):
        df = from_grid(inf, as_dataframe=as_dataframe)
    elif defspec.startswith('subset'):
        df = from_subset(inf, as_dataframe=as_dataframe)
    elif defspec.startswith('regridded-swath'):
        df = from_regriddedswath(inf)
    else:
        raise IOError(
            f'{defspec} not in supported formats (profile, site, swath,'
            ' calipso, polygon, grid, subset, or regridded-swath)'
        )
    if na_values is not None:
        df.replace(na_values, np.nan, inplace=True)
    return df
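

# Usage sketch (not part of pyrsig): from_xdr dispatches on the first header
# line, so bytes already in memory (e.g., a downloaded response body that
# has already been decompressed) can be wrapped in io.BytesIO. raw is
# assumed to be an uncompressed XDR payload and -9999.0 is a hypothetical
# missing-value flag.
def _example_from_xdr(raw):
    import io
    return from_xdr(io.BytesIO(raw), na_values=-9999.0)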


def from_polygon(bf):
    """
    Currently supports Polygon (v1.0) which has 10 header rows in text
    format. The text header rows also describe the binary portion of the
    file, which includes three segments of data corresponding to shx, shp,
    and dbf files embedded within the file.

    Arguments
    ---------
    bf : file
        File opened in byte read in XDR format with RSIG headers

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content

    Notes
    -----
    xdrdump master_hms_smoke_2018-07-01_to_2018-07-02.xdr|m
    Polygon 1.0
    https://www.ospo.noaa.gov/Products/land/hms.html,hmsserver
    2018-07-01T00:00:00-0000 2018-07-02T23:59:59-0000
    # Variable names:
    smoke
    # Variable units:
    ug/m3
    # name shxdatabytes shpdatabytes dbfdatabytes and
    # shxfiledata shpfiledata dbffiledata:
    hms_smoke 828 30812 2468
    """
    import tempfile
    import geopandas as gpd

    for i in range(10):
        _l = bf.readline().decode().strip()
        if i == 0:
            assert (_l.lower() == 'polygon 1.0')
        if i == 2:
            dates, datee = _l.replace(':', '').replace('-0000', 'Z').split()
        elif i == 9:
            prefix, shxn, shpn, dbfn = _l.split()
            shxn = int(shxn)
            shpn = int(shpn)
            dbfn = int(dbfn)

    shxs = bf.tell()
    shxe = shxs + shxn
    shps = shxe
    shpe = shps + shpn
    dbfs = shpe
    dbfe = dbfs + dbfn
    with tempfile.TemporaryDirectory() as td:
        filestem = f'{td}/{prefix}_{dates}_{datee}'
        bf.seek(shxs, 0)
        with open(filestem + '.shx', 'wb') as shxf:
            shxf.write(bf.read(shxe - shxs))
        with open(filestem + '.shp', 'wb') as shpf:
            shpf.write(bf.read(shpe - shps))
        with open(filestem + '.dbf', 'wb') as dbff:
            dbff.write(bf.read(dbfe - dbfs))
        outdf = gpd.read_file(filestem + '.shp')

    outdf.crs = 4326
    return outdf


def from_profile(bf):
    """
    Currently supports Profile (v2.0) which has 14 header rows in text
    format. The text header rows also describe the binary portion of the
    file.

    Arguments
    ---------
    bf : file
        File opened in byte read in XDR format with RSIG headers

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content

    Notes
    -----
    # Line 1: definition spec
    # Line 2: Info
    # Line 3: Start/End date
    # Line 4-5: bbox
    # Line 6-7: Dimensions
    # Line 8-9: Variable names
    # Line 10-11: Units
    # Line 12-14: definition of data chunks lines
    # Next nprof * 80 characters are notes for each profile
    # Next nprof * 8 are N_p number of points for each profile as long int
    # Next N * sum(N_p for p in nprof) * 8 are data as 64-bit reals

    Example Header
    --------------
    Profile 2.0
    http://www.esrl.noaa.gov/gmd/grad/neubrew/,NEUBrewSubset
    2010-07-10T00:00:00-0000 2010-07-10T23:59:59-0000
    # Subset domain: <min_lon> <min_lat> <max_lon> <max_lat>:
    -126 24 -66 50
    # Dimensions: variables profiles:
    6 5
    # Variable names:
    timestamp id longitude latitude elevation ozone
    # Variable units:
    yyyymmddhhmmss - deg deg m molecules/cm3
    # char notes[profiles][80] and
    # MSB 64-bit integers points[profiles] and
    # IEEE-754 64-bit reals data_1[variables][points_1] ...
    data_P[variables][points_P]:
    """
    import numpy as np
    import pandas as pd

    for i in range(14):
        _l = bf.readline()
        if i == 0:
            assert (_l.decode().strip().lower() == 'profile 2.0')
        elif i == 6:
            nvar, nprof = np.array(_l.decode().strip().split(), dtype='i')
        elif i == 8:
            varkeys = _l.decode().strip().split()
        elif i == 10:
            units = _l.decode().strip().split()

    notes = [bf.read(80).decode().strip() for i in range(nprof)]
    longnp = np.frombuffer(bf.read(nprof * 8), dtype='>i8')
    # Read all values at once
    dfs = []
    varwunits = [varkey + f'({units[i]})' for i, varkey in enumerate(varkeys)]
    for i, npoints in enumerate(longnp):
        nfloats = npoints * nvar
        vals = np.frombuffer(bf.read(nfloats * 8), dtype='>d').reshape(
            nvar, npoints
        )
        vals = vals.byteswap().view(vals.dtype.newbyteorder())
        df = pd.DataFrame(dict(zip(varwunits, vals)))
        df['NOTE'] = notes[i]
        dfs.append(df)

    df = pd.concat(dfs)
    infmt = '%Y%m%d%H%M%S'
    outfmt = '%Y-%m-%dT%H:%M:%S%z'
    ntimestamps = df['timestamp(yyyymmddhhmmss)']
    timestamps = pd.to_datetime(
        ntimestamps, format=infmt, utc=True
    ).dt.strftime(outfmt)
    df.drop('timestamp(yyyymmddhhmmss)', axis=1, inplace=True)
    df['Timestamp(UTC)'] = timestamps
    renames = {
        'id(-)': 'STATION(-)', 'longitude(deg)': 'LONGITUDE(deg)',
        'latitude(deg)': 'LATITUDE(deg)', 'elevation(m)': 'ELEVATION(m)'
    }
    renames = {k: v for k, v in renames.items() if k in df.columns}
    df.rename(columns=renames, inplace=True)
    frontkeys = [
        'Timestamp(UTC)', 'LONGITUDE(deg)', 'LATITUDE(deg)', 'ELEVATION(m)',
        'STATION(-)'
    ]
    lastkeys = ['NOTE']
    outkeys = frontkeys + [
        k for k in df.columns if k not in (frontkeys + lastkeys)
    ] + lastkeys
    df = df[outkeys]
    return df
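

# Sketch of the decoding pattern shared by these readers: XDR payloads are
# big-endian, so numpy.frombuffer reads them with '>i8'/'>d' dtypes, and
# byteswap().view(...) converts the result to native byte order for pandas.
# The payload here is synthetic.
def _example_bigendian_decode():
    import numpy as np
    payload = np.arange(6, dtype='>d').tobytes()  # stand-in for bf.read()
    vals = np.frombuffer(payload, dtype='>d').reshape(2, 3)
    vals = vals.byteswap().view(vals.dtype.newbyteorder())
    assert np.allclose(vals.ravel(), np.arange(6))
    return vals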


def from_swath(bf):
    """
    Currently supports Swath (v2.0) which has 14 header rows in text
    format. The text header rows also describe the binary portion of the
    file.

    Arguments
    ---------
    bf : file
        File opened in byte read in XDR format with RSIG headers

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content

    Notes
    -----
    # 14 header rows to be read using \n
    # Line 1: definition spec
    # Line 2: Info
    # Line 3: Start/End date
    # Line 4-5: Dimensions
    # Line 6-7: Variable names
    # Line 8-9: Units
    # Line 10-11: bbox
    # Line 12-14: definition of data chunks lines
    # Next nscan * 8 bytes are timestamps (yyyydddhhmm) as MSB 64-bit
    #   integers
    # Next nscan * 8 bytes are points per scan (N_s) as MSB 64-bit integers
    # Next nvar * sum(N_s for s in nscan) * 8 bytes are data as 64-bit reals

    Example Header
    --------------
    Swath 2.0
    http://www.star.nesdis.noaa.gov/smcd/emb/viirs_aerosol/,VIIRSSubset
    2013-06-15T00:00:00-0000
    # Dimensions: variables timesteps scans:
    12 24 4
    # Variable names:
    Longitude Latitude Scan_Start_Time faot550 Longitude_SW Longitude_SE ...
    Longitude_NW Longitude_NE Latitude_SW Latitude_SE Latitude_NW ...
    Latitude_NE
    # Variable units:
    deg deg YYYYDDDHHMM - deg deg deg deg deg deg deg deg
    # Domain: <min_lon> <min_lat> <max_lon> <max_lat>
    -113 26 -111 26.5
    # MSB 64-bit integers (yyyydddhhmm) timestamps[scans] and
    # MSB 64-bit integers points[scans] and
    # IEEE-754 64-bit reals data_1[variables][points_1] ...
    data_S[variables][points_S]:
    """
    import numpy as np
    import pandas as pd

    for i in range(14):
        _l = bf.readline()
        if i == 0:
            defspec = _l.decode().strip().lower()
        elif i == 2:
            pass  # stime = _l.decode().strip()
        elif i == 4:
            nvar, nt, nscan = np.array(_l.decode().strip().split(), dtype='i')
        elif i == 6:
            varkeys = _l.decode().strip().split()
        elif i == 8:
            units = _l.decode().strip().split()

    assert (defspec == 'swath 2.0')
    # MSB 64-bit integers (yyyydddhhmm) timestamps[scans]
    nts = np.frombuffer(bf.read(nscan * 8), dtype='>i8')
    # MSB 64-bit integers points[scans]
    nps = np.frombuffer(bf.read(nscan * 8), dtype='>i8')
    infmt = '%Y%j%H%M'
    outfmt = '%Y-%m-%dT%H:%M:%S%z'
    timestamps = pd.to_datetime(nts, format=infmt, utc=True).strftime(outfmt)
    # Read all values at once
    # IEEE-754 64-bit reals data_1[variables][points_1] ...
    # data_S[variables][points_S]
    dfs = []
    varwunits = [varkey + f'({units[i]})' for i, varkey in enumerate(varkeys)]
    # To-do
    # Switch from iterative unpacking to numpy.frombuffer, which is much
    # faster; this requires some fancy indexing and complex repeats.
    for i, npoints in enumerate(nps):
        nfloats = npoints * nvar
        vals = np.frombuffer(bf.read(nfloats * 8), dtype='>d').reshape(
            nvar, npoints
        )
        vals = vals.byteswap().view(vals.dtype.newbyteorder())
        df = pd.DataFrame(dict(zip(varwunits, vals)))
        df['Timestamp(UTC)'] = timestamps[i]
        dfs.append(df)

    df = pd.concat(dfs)
    renames = {
        'Longitude(deg)': 'LONGITUDE(deg)',
        'Latitude(deg)': 'LATITUDE(deg)',
    }
    renames = {k: v for k, v in renames.items() if k in df.columns}
    df.rename(columns=renames, inplace=True)
    outkeys = ['Timestamp(UTC)', 'LONGITUDE(deg)', 'LATITUDE(deg)']
    outkeys = outkeys + [k for k in df if k not in outkeys]
    df = df[outkeys]
    return df
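

# Sketch (synthetic input): swath timestamps are MSB 64-bit integers in
# yyyydddhhmm form (year, day-of-year, hour, minute), which pandas parses
# with the '%Y%j%H%M' format used above. 20131662000 is day 166 of 2013,
# i.e. 2013-06-15 20:00 UTC.
def _example_swath_timestamp():
    import numpy as np
    import pandas as pd
    nts = np.array([20131662000])
    ts = pd.to_datetime(nts, format='%Y%j%H%M', utc=True)
    return ts.strftime('%Y-%m-%dT%H:%M:%S%z')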


def from_site(bf):
    """
    Currently supports Site (v2.0) which has 13 header rows in text
    format. The text header rows also describe the binary portion of the
    file.

    Arguments
    ---------
    bf : file
        File opened in byte read in XDR format with RSIG headers

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content

    Notes
    -----
    # 13 header rows to be read using \n
    # Line 1: definition spec
    # Line 2: Info
    # Line 3: Start/End date
    # Line 4-5: Dimensions
    # Line 6-7: Variable names
    # Line 8-9: Units
    # Line 10-13: definition of data chunks lines
    # Next nsite * 80 characters are notes for each site
    # Next nsite * 4 (or * 8) bytes are site IDs as 32-bit or 64-bit
    #   integers
    # Next nsite * 2 * 4 (or * 8) bytes are <longitude,latitude> pairs as
    #   32-bit or 64-bit reals
    # Next nt * nsite * 4 (or * 8) bytes are data as 32-bit or 64-bit reals

    Example Header
    --------------
    SITE 2.0
    http://airnow.gov/,reformat_airnow_obs,SiteSubset
    2005-08-26T00:00:00-0000
    # data dimensions: timesteps stations
    3 9
    # Variable names:
    PM25
    # Variable units:
    ug/m3
    # char notes[stations][80] and
    # MSB 64-bit integers ids[stations] and
    # IEEE-754 64-bit reals sites[stations][2=<longitude,latitude>] and
    # IEEE-754 64-bit reals data[timesteps][stations]:
    """
    import numpy as np
    import pandas as pd

    for i in range(13):
        _l = bf.readline()
        if i == 0:
            assert (_l.decode().strip().lower() == 'site 2.0')
        elif i == 2:
            stime = _l.decode().strip()
        elif i == 4:
            nt, nsite = np.array(_l.decode().strip().split(), dtype='i')
        elif i == 6:
            varkeys = _l.decode().strip().split()
        elif i == 8:
            units = _l.decode().strip().split()
        elif i == 10:
            idtype = _l.decode().strip()
        elif i == 11:
            xytype = _l.decode().strip()
        elif i == 12:
            valtype = _l.decode().strip()

    idtype = np.dtype({True: '>i8', False: '>i4'}['64' in idtype])
    xytype = np.dtype({True: '>f8', False: '>f4'}['64' in xytype])
    valtype = np.dtype({True: '>f8', False: '>f4'}['64' in valtype])
    time = pd.date_range(stime, periods=nt, freq='h')
    notes = [bf.read(80).decode().strip() for i in range(nsite)]
    buff = bf.read(nsite * idtype.itemsize)
    nis = np.frombuffer(buff, dtype=idtype)
    buff = bf.read(nsite * xytype.itemsize * 2)
    xys = np.frombuffer(buff, dtype=xytype).reshape(nsite, 2)
    # Read all values at once
    varwunits = [varkey + f'({units[i]})' for i, varkey in enumerate(varkeys)]
    # Unpacking data using frombuffer, which is much faster than iteratively
    # calling xdr.unpack_farray
    vals = np.frombuffer(bf.read(), dtype=valtype).reshape(1, nsite, nt)
    # Expand per-site/time coordinates to time-major order so they align
    # with the raveled data block (file layout is data[timesteps][stations])
    atimes = np.array(time, ndmin=1)[None, :].repeat(nsite, 0).T.ravel()
    anotes = np.array(notes)[:, None].repeat(nt, 1).T.ravel()
    astation = nis[:, None].repeat(nt, 1).T.ravel()
    axs = xys[:, 0, None].repeat(nt, 1).T.ravel()
    ays = xys[:, 1, None].repeat(nt, 1).T.ravel()
    data = {
        'Timestamp(UTC)': atimes,
        'SITE_NAME': anotes,
        'STATION(-)': astation.astype(f'={astation.dtype.char}'),
        'LONGITUDE(deg)': axs.astype(f'={axs.dtype.char}'),
        'LATITUDE(deg)': ays.astype(f'={ays.dtype.char}'),
    }
    for vi, varwunit in enumerate(varwunits):
        data[varwunit] = vals[vi].ravel().astype(f'={vals.dtype.char}')

    df = pd.DataFrame(data)
    # Serves as a comment on how this was done iteratively.
    """
    dfs = []
    for i, id in enumerate(nis):
        lon, lat = xys[i]
        vals = np.array(up.unpack_farray(nt, up.unpack_float)).reshape(1, nt)
        # assert np.allclose(valchk[0, i], vals)
        df = pd.DataFrame(dict(zip(varwunits, vals)))
        df['Timestamp(UTC)'] = time
        df['SITE_NAME'] = notes[i]
        df['STATION(-)'] = id
        df['LONGITUDE(deg)'] = lon
        df['LATITUDE(deg)'] = lat
        dfs.append(df)

    df = pd.concat(dfs)
    """
    frontkeys = [
        'Timestamp(UTC)', 'LONGITUDE(deg)', 'LATITUDE(deg)', 'STATION(-)'
    ]
    lastkeys = ['SITE_NAME']
    outkeys = frontkeys + [
        k for k in df.columns if k not in (frontkeys + lastkeys)
    ] + lastkeys
    df = df[outkeys]
    return df
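

# Sketch (synthetic sizes): the repeat/ravel pattern above expands per-site
# metadata against the time axis so every (time, site) pair gets a row, in
# time-major order to match the raveled data block.
def _example_site_expand():
    import numpy as np
    nt, nsite = 4, 3
    ids = np.array([101, 102, 103])
    astation = ids[:, None].repeat(nt, 1).T.ravel()
    # time-major: all sites at t0, then all sites at t1, ...
    assert (astation[:nsite] == ids).all()
    return astation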


def from_point(bf):
    """
    Currently supports Point (v1.0) which has 11 header rows in text
    format. The text header rows also describe the binary portion of the
    file.

    Arguments
    ---------
    bf : file
        File opened in byte read in XDR format with RSIG headers

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content

    Notes
    -----
    # 11 header rows to be read using \n
    # Line 1: definition spec
    # Line 2: Info
    # Line 3: Start/End date
    # Line 4-5: Dimensions
    # Line 6-7: Variable names
    # Line 8-9: Units
    # Line 10-11: definition of data chunks lines
    # Next npoint * 80 characters are notes for each point
    # Next nvariable * npoint * 8 bytes are data[variables][points]

    Example Header
    --------------
    Point 1.0
    https://satepsanone.nesdis.noaa.gov/pub/FIRE/BBEP-geo,GOESBBSubset
    2018-11-09T00:00:00-0000 2018-11-10T23:00:00-0000
    # Dimensions: variables points:
    4 1876
    # Variable names:
    Timestamp Longitude Latitude PM25_emission
    # Variable units:
    yyyymmddhhmmss deg deg kg
    # char notes[points][80] and
    # IEEE-754 64-bit reals data[variables][points]:
    """
    import numpy as np
    import pandas as pd

    for i in range(11):
        _l = bf.readline()
        if i == 0:
            assert (_l.decode().strip().lower() == 'point 1.0')
        elif i == 2:
            # stime = _l.decode().strip()
            pass  # not loading time because it is duplicative
        elif i == 4:
            nvar, npoint = np.array(_l.decode().strip().split(), dtype='i')
        elif i == 6:
            varkeys = _l.decode().strip().split()
        elif i == 8:
            units = _l.decode().strip().split()

    notes = [bf.read(80).decode().strip() for i in range(npoint)]
    # Use numpy frombuffer because it is faster than iterative unpacking
    vals = np.frombuffer(bf.read(), dtype='>d').reshape(nvar, npoint)
    vals = vals.byteswap().view(vals.dtype.newbyteorder())
    varkeys = [
        {'id': 'STATION'}.get(varkey, varkey).upper() for varkey in varkeys
    ]
    varwunits = [varkey + f'({units[i]})' for i, varkey in enumerate(varkeys)]
    data = {
        'NOTE': notes,
    }
    for vi, varwunit in enumerate(varwunits):
        data[varwunit] = vals[vi].ravel().astype(f'={vals.dtype.char}')

    df = pd.DataFrame(data)
    infmt = '%Y%m%d%H%M%S'
    outfmt = '%Y-%m-%dT%H:%M:%S%z'
    ntimestamps = df['TIMESTAMP(yyyymmddhhmmss)']
    timestamps = pd.to_datetime(
        ntimestamps, format=infmt, utc=True
    ).dt.strftime(outfmt)
    df.drop('TIMESTAMP(yyyymmddhhmmss)', axis=1, inplace=True)
    ti = varwunits.index('TIMESTAMP(yyyymmddhhmmss)')
    varwunits[ti] = 'Timestamp(UTC)'
    df['Timestamp(UTC)'] = timestamps
    outkeys = varwunits + ['NOTE']
    df = df[outkeys].rename(
        columns={'PM25_ATM_HOURLY(ug/m3)': 'pm25_atm_hourly(ug/m3)'}
    )
    return df


def from_calipso(bf):
    """
    Currently supports Calipso (v1.0) which has 15 header rows in text
    format. The text header rows also describe the binary portion of the
    file.

    Arguments
    ---------
    bf : file
        File opened in byte read in XDR format with RSIG headers

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content

    Example Header
    --------------
    CALIPSO 1.0
    https://eosweb.larc.nasa.gov/project/calipso/calipso_table,CALIPSOSubset
    2016-05-04T00:00:00-0000
    # Dimensions: variables timesteps profiles:
    5 24 8
    # Variable names: (VariableName is Total_Backscatter_Coefficient_532)
    Profile_UTC_Time Longitude Latitude Elevation VariableName
    # Variable units:
    yyyymmdd.f deg deg m per_kilometer_per_steradian
    # Domain: <min_lon> <min_lat> <max_lon> <max_lat>
    -126 24 -66 50
    # MSB 64-bit integers (yyyydddhhmm) profile_timestamps[profiles] and
    # IEEE-754 64-bit reals profile_bounds[profiles][2=<lon,lat>][2=<min,max>]
    # MSB 64-bit integers profile_dimensions[profiles][2=<points,levels>] and
    # IEEE-754 64-bit reals profile_data_1[variables][points_1][levels]
    # ... profile_data_S[variables][points_S][levels]:
    """
    import numpy as np
    import pandas as pd
    import xarray as xr

    headerlines = []
    for i in range(15):
        _l = bf.readline().decode()
        headerlines.append(_l)
        if i == 0:
            assert (_l.strip().lower() == 'calipso 1.0')
        elif i == 2:
            # stime = _l.strip()
            pass  # not loading time because it is duplicative
        elif i == 4:
            nvar, ntime, nprof = np.array(_l.strip().split(), dtype='i')
        elif i == 6:
            varkeys = _l.strip().split()
        elif i == 8:
            units = _l.strip().split()
        elif i == 10:
            # bbox = [float(d) for d in _l.strip().split()]
            pass  # bbox not loaded; not needed

    stime = bf.tell()
    etime = stime + nprof * 8
    sbnd = etime
    ebnd = sbnd + (nprof * 4) * 8
    sdims = ebnd
    edims = sdims + (nprof * 2) * 8
    sdata = edims
    # time = np.frombuffer(bf.read(etime - stime), dtype='>i8')
    # bshape = [nprof, 2, 2]
    # tbnds = np.frombuffer(bf.read(ebnd - sbnd), dtype='>d').reshape(*bshape)
    bf.seek(sdims, 0)
    dims = np.frombuffer(bf.read(edims - sdims), dtype='>i8').reshape(nprof, 2)
    if not (dims[:, 1] == dims[0, 1]).all():
        raise IOError(
            'CALIPSO 1.0 xdr reader needs to be updated; levels vary. Use'
            + ' ASCII backend instead of xdr until resolved. Please report to'
            + ' https://github.com/barronh/pyrsig/issues'
        )
    times = []
    lons = []
    lats = []
    elev = []
    data = []
    for iprof, (npoint, nlev) in enumerate(dims):
        edata = sdata + (3 * npoint + 2 * npoint * nlev) * 8
        vals = np.frombuffer(bf.read(edata - sdata), dtype='>d')
        vals = vals.byteswap().view(vals.dtype.newbyteorder())
        sd = 0
        ed = sd + npoint * 3
        ptimes, plons, plats = vals[sd:ed].reshape(3, npoint)
        sd = ed
        ed = sd + npoint * nlev * 2
        pelev, pdata = vals[sd:ed].reshape(2, npoint, nlev)
        times.append(ptimes)
        lons.append(plons)
        lats.append(plats)
        elev.append(pelev)
        data.append(pdata)
        sdata = edata

    ds = xr.Dataset()
    alldata = [times, lons, lats, elev, data]
    for key, vals, unit in zip(varkeys, alldata, units):
        attrs = dict(long_name=key, units=unit)
        dims = {1: ('points',), 2: ('points', 'levels')}[vals[0].ndim]
        ds[key] = dims, np.concatenate(vals, axis=0), attrs

    ds.attrs['description'] = '\n'.join(headerlines)
    df = ds.to_dataframe().reset_index(drop=True)
    renamer = {key: f'{key}({unit})' for key, unit in zip(varkeys, units)}
    renamer['Latitude'] = 'LATITUDE(deg)'
    renamer['Longitude'] = 'LONGITUDE(deg)'
    renamer['Elevation'] = 'ELEVATION(m)'
    # yyyymmdd.f: the integer part is the date and the fractional part is
    # the fraction of a day; parsed with an explicit format because numeric
    # input would otherwise be treated as nanoseconds since the epoch
    df['Timestamp(UTC)'] = (
        pd.to_datetime(
            (df['Profile_UTC_Time'] // 1).astype('i8'), format='%Y%m%d',
            utc=True
        )
        + pd.to_timedelta(df['Profile_UTC_Time'] % 1, unit='d')
    ).dt.round('1s')
    df = df[['Timestamp(UTC)'] + varkeys[1:-1] + varkeys[:1] + varkeys[-1:]]
    df.rename(columns=renamer, inplace=True)
    return df
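

# Sketch (synthetic buffer): the offset bookkeeping in from_calipso computes
# the byte range of each fixed-size segment up front, then seeks past the
# segments it does not need (timestamps and bounds) straight to the
# per-profile dimensions.
def _example_calipso_offsets():
    import io
    import numpy as np
    nprof = 2
    dims = np.array([[10, 5], [12, 5]], dtype='>i8')  # points, levels
    pad = b'\x00' * (nprof * 8 + nprof * 4 * 8)  # timestamps + bounds
    bf = io.BytesIO(pad + dims.tobytes())
    sdims = nprof * 8 + nprof * 4 * 8
    bf.seek(sdims, 0)
    out = np.frombuffer(bf.read(nprof * 2 * 8), dtype='>i8')
    assert (out.reshape(nprof, 2) == dims).all()
    return out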


def from_grid(bf, as_dataframe=True):
    """
    Currently supports Grid (v1.0) which has 12 header rows in text format.
    The text header rows also describe the binary portion of the file.

    Arguments
    ---------
    bf : file
        File opened in byte read in XDR format with RSIG headers
    as_dataframe : bool
        If True (default), return data as a pandas.DataFrame.
        If False, return an xarray.Dataset.

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content

    Example Header
    --------------
    xdrdump master_hrrr_wind_2020-02-17.xdr
    Grid 1.0
    http://home.chpc.utah.edu/~u0553130/Brian_Blaylock/,HRRRSubset
    2020-02-17T00:00:00-0000
    # Dimensions: timesteps variables rows columns:
    24 2 85 73
    # Variable names:
    wind_u wind_v
    # Variable units:
    m/s m/s
    # IEEE-754 64-bit reals longitudes[rows][columns] and
    # IEEE-754 64-bit reals latitudes[rows][columns] and
    # IEEE-754 64-bit reals data[timesteps][variables][rows][columns]:
    -7.8518169999999998e+01 -7.8484610000000004e+01 -7.8451070000000001e+01
    ...
    """
    import numpy as np
    import pandas as pd
    import xarray as xr

    headerlines = []
    for i in range(12):
        _l = bf.readline().decode().strip()
        headerlines.append(_l)
        if i == 0:
            assert (_l.lower() == 'grid 1.0')
        elif i == 1:
            rsig_program = _l
        elif i == 2:
            stime = pd.to_datetime(_l)
        elif i == 4:
            ntime, nvar, nrow, ncol = np.array(_l.split(), dtype='i')
        elif i == 6:
            varkeys = _l.split()
        elif i == 8:
            units = _l.split()
        elif i == 10:
            pass  # data definition lines not needed

    nbytes = nrow * ncol * 8
    lon = np.frombuffer(bf.read(nbytes), dtype='>d').reshape(nrow, ncol)
    lon = lon.byteswap().view(lon.dtype.newbyteorder())
    lat = np.frombuffer(bf.read(nbytes), dtype='>d').reshape(nrow, ncol)
    lat = lat.byteswap().view(lat.dtype.newbyteorder())
    dshape = ntime, nvar, nrow, ncol
    nbytes = np.prod(dshape) * 8
    data = np.frombuffer(bf.read(nbytes), dtype='>d').reshape(*dshape)
    data = data.byteswap().view(data.dtype.newbyteorder())
    ds = xr.Dataset()
    ds.attrs['rsig_program'] = rsig_program
    dt = pd.to_timedelta('3600s')
    reftime = '1970-01-01T00:00:00+0000'
    attrs = dict(units=f'seconds since {reftime}', long_name='time')
    ds.coords['time'] = ('time',), stime + np.arange(ntime) * dt, attrs
    attrs = dict(units='index', long_name='x_center')
    ds.coords['x'] = ('x',), np.arange(ncol, dtype='d') + 0.5, attrs
    attrs = dict(units='index', long_name='y_center')
    ds.coords['y'] = ('y',), np.arange(nrow, dtype='d') + 0.5, attrs
    attrs = dict(units='degrees_north', long_name='latitude')
    ds['lat'] = ('y', 'x'), lat, attrs
    attrs = dict(units='degrees_east', long_name='longitude')
    ds['lon'] = ('y', 'x'), lon, attrs
    for vi, varkey in enumerate(varkeys):
        attrs = dict(long_name=varkey, units=units[vi])
        ds[varkey] = ('time', 'y', 'x'), data[:, vi, :, :], attrs

    if as_dataframe:
        df = ds.to_dataframe().astype('d')
        time = df.index.get_level_values('time')
        df['Timestamp(UTC)'] = time.strftime('%Y-%m-%dT%H:%M:%S+0000')
        keepcols = ['Timestamp(UTC)', 'lon', 'lat'] + varkeys
        renamer = {vk: f'{vk}({vu})' for vk, vu in zip(varkeys, units)}
        renamer['lon'] = 'LONGITUDE(deg)'
        renamer['lat'] = 'LATITUDE(deg)'
        df = df[keepcols].rename(columns=renamer)
        return df
    else:
        return ds
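

# Usage sketch (hypothetical path): with as_dataframe=False, from_grid
# returns an xarray.Dataset with time/y/x coordinates and 2-D lon/lat, so
# grid variables (wind_u in the example header above) keep their shape.
def _example_grid_dataset():
    with open('hrrr_wind.xdr', 'rb') as bf:
        ds = from_grid(bf, as_dataframe=False)
    return ds['wind_u'].mean('time')  # per-cell time average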


def from_subset(bf, as_dataframe=True):
    """
    Currently supports Subset (v9.0) which has 17 header rows in text
    format. The text header rows also describe the binary portion of the
    file.

    This data can be very large for high resolution continental data. For
    example, CMAQ EQUATES CONUS for one 4D variable requires 4GB of memory
    as a dataframe whereas the same data from CMAQ EQUATES HEMI is just
    0.065GB. Both are much smaller as a Dataset where coordinate data is
    not repeated.

    Arguments
    ---------
    bf : file
        File opened in byte read in XDR format with RSIG headers
    as_dataframe : bool
        If True (default), return data as a pandas.DataFrame.
        If False, return an xarray.Dataset.

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content

    Notes
    -----
    xdrdump master_cmaq_amad_hemi_ext_2006-08-28.xdr |m
    SUBSET 9.0 CMAQ
    108NHEMI1
    http://www.epa.gov/ttn/scram/,CMAQSubset
    2006-08-28T00:00:00-0000
    # data dimensions: timesteps variables layers rows columns:
    1 4 35 187 187
    # subset indices (0-based time, 1-based layer/row/column): first-timestep ...
    last-timestep first-layer last-layer first-row last-row first-column ...
    last-column:
    0 0 1 35 1 187 1 187
    # Variable names:
    LONGITUDE LATITUDE ELEVATION EXT_Recon
    # Variable units:
    deg deg m 1/km
    # stereographic projection: lat_0 lon_0 lat_sec major_semiaxis ...
    minor_semiaxis
    90 -98 45 6.37e+06 6.37e+06
    # Grid: ncols nrows xorig yorig xcell ycell vgtyp vgtop vglvls[36]:
    187 187 -1.0098e+07 -1.0098e+07 108000 108000 7 5000 1 0.9975 0.995 ...
    # IEEE-754 32-bit reals data[variables][timesteps][layers][rows][columns]:
    -1.4300000000000000e+02 -1.4269029235839844e+02 ...
    """
    import numpy as np
    import pandas as pd
    import xarray as xr
    from .utils import get_proj4
    import pyproj

    headerlines = []
    for i in range(17):
        _l = bf.readline().decode().strip()
        headerlines.append(_l)
        if i == 0:
            assert (_l.lower() == 'subset 9.0 cmaq')
        elif i == 1:
            gdnam = _l
        elif i == 2:
            rsig_program = _l
        elif i == 3:
            stime = pd.to_datetime(_l)
        elif i == 5:
            ntime, nvar, nlay, nrow, ncol = np.array(_l.split(), dtype='i')
        elif i == 7:
            ftime, ltime, flay, llay, frow, lrow, fcol, lcol = np.array(
                _l.split(), dtype='i'
            )
        elif i == 9:
            varkeys = _l.split()
        elif i == 11:
            units = _l.split()
        elif i == 12:
            crshdr = _l.lower()
        elif i == 13:
            crsparts = np.array(_l.split(), dtype='d')
        elif i == 15:
            projparts = np.array(_l.split(), dtype='d')
            vgparts = np.array(projparts[-nlay - 3:], dtype='f')
            gridparts = projparts[:-len(vgparts)]
        elif i == 16:
            is64 = '64-bit' in _l
            if is64:
                dtval = '>d'
            else:
                dtval = '>f'

    gridprops = getgridprops(
        gdnam, crshdr, crsparts, projparts, gridparts, vgparts
    )
    vgprops = gridprops['vertical']
    vglvls = vgprops['VGLVLS']
    projattrs = gridprops['projection']
    earth_radius = gridprops['crs']['major_semiaxis']
    proj4 = get_proj4(projattrs, earth_radius=earth_radius)
    proj = pyproj.Proj(proj4)
    data = np.frombuffer(bf.read(), dtype=dtval).reshape(
        nvar, ntime, nlay, nrow, ncol
    )
    ds = xr.Dataset()
    ds.attrs.update(**projattrs)
    ds.attrs.update(**vgprops)
    ds.attrs['UPNAM'] = rsig_program.ljust(16)
    dt = pd.to_timedelta('3600s')
    t = stime + np.arange(ntime) * dt
    reftime = '1970-01-01T00:00:00+0000'
    attrs = dict(long_name='TSTEP', units=f'seconds since {reftime}')
    ds.coords['TSTEP'] = t
    z = ((vglvls[1:] + vglvls[:-1]) / 2)[flay - 1:llay]
    attrs = dict(long_name='LAY', units='layer_center')
    ds.coords['LAY'] = ('LAY',), z, attrs
    y = np.arange(nrow) + frow - 0.5
    attrs = dict(long_name='ROW', units='cell_center')
    ds.coords['ROW'] = ('ROW',), y, attrs
    x = np.arange(ncol) + fcol - 0.5
    attrs = dict(long_name='COL', units='cell_center')
    ds.coords['COL'] = ('COL',), x, attrs
    Y, X = xr.broadcast(ds['ROW'], ds['COL'])
    LON, LAT = proj(X.values, Y.values, inverse=True)
    attrs = dict(long_name='LONGITUDE'.ljust(16), units='degrees_east ')
    attrs['var_desc'] = attrs['long_name'].ljust(80)
    ds['LONGITUDE'] = ('ROW', 'COL'), LON.astype('f'), attrs
    attrs = dict(long_name='LATITUDE'.ljust(16), units='degrees_north ')
    attrs['var_desc'] = attrs['long_name'].ljust(80)
    ds['LATITUDE'] = ('ROW', 'COL'), LAT.astype('f'), attrs
    dims = ('TSTEP', 'LAY', 'ROW', 'COL')
    for vi, vk in enumerate(varkeys):
        attrs = dict(
            units=units[vi].ljust(16), long_name=vk.ljust(16),
            var_desc=vk.ljust(80)
        )
        vals = data[vi]
        ds[vk] = dims, vals, attrs

    if as_dataframe:
        df = ds.astype(dtval[1:]).to_dataframe()
        time = df.index.get_level_values('TSTEP')
        df['Timestamp(UTC)'] = time.strftime('%Y-%m-%dT%H:%M:%S+0000')
        keepcols = ['Timestamp(UTC)', 'LONGITUDE', 'LATITUDE'] + varkeys
        renamer = {vk: f'{vk}({vu})' for vk, vu in zip(varkeys, units)}
        renamer['LONGITUDE'] = 'LONGITUDE(deg)'
        renamer['LATITUDE'] = 'LATITUDE(deg)'
        df = df[keepcols].rename(columns=renamer)
        return df
    else:
        return ds
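

# Usage sketch (hypothetical path): for large CMAQ subsets, prefer
# as_dataframe=False; the Dataset form keeps TSTEP/LAY/ROW/COL dimensions
# and does not repeat coordinate values on every row.
def _example_subset_dataset():
    with open('cmaq_ext.xdr', 'rb') as bf:
        ds = from_subset(bf, as_dataframe=False)
    return ds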


def from_regriddedswath(bf):
    """
    Currently supports regridded-swath (v2.0) which has 19 header rows in
    text format. The text header rows also describe the binary portion of
    the file.

    Arguments
    ---------
    bf : file
        File opened in byte read in XDR format with RSIG headers

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content

    Notes
    -----
    In progress.

    xdrdump master_regridded_viirs_ivaot_faot550_2013-06-15.xdr|m
    REGRIDDED-Swath 2.0
    http://....nesdis.noaa.gov/smcd/emb/viirs_aerosol/,VIIRSSubset,XDRConvert
    2013-06-15T20:00:00-0000
    # timesteps
    2
    # Variable name:
    faot550
    # Variable units:
    -
    # stereographic proj: lat_0 lon_0 lat_sec major_semiaxis minor_semiaxis
    90 -98 45 6370000.000000 6370000.000000
    # Grid: ncols nrows xorig yorig xcell ycell vgtyp vgtop vglvls[2]:
    137 137 -7398000.0 -7398000.0 108000.0 108000.0 2 10000.0 1 0.995
    # MSB 64-bit integers points[timesteps] and
    # IEEE-754 64-bit reals longitudes[timesteps][points] and
    # IEEE-754 64-bit reals latitudes[timesteps][points] and
    # MSB 64-bit integers columns[timesteps][points] and
    # MSB 64-bit integers rows[timesteps][points] and
    # IEEE-754 64-bit reals data[timesteps][points]:
    154 719
    -1.2970142966950573e+02 -1.2908750532347997e+02 -1.2783553910026154e+02
    ...
    """
    import numpy as np
    import pandas as pd
    # from .utils import get_proj4

    headerlines = []
    for i in range(19):
        _l = bf.readline().decode().strip()
        headerlines.append(_l)
        if i == 0:
            assert (_l.lower() == 'regridded-swath 2.0')
        elif i == 1:
            rsig_program = _l
        elif i == 2:
            stime = pd.to_datetime(_l)
        elif i == 4:
            ntime, = np.array(_l.split(), dtype='i')
        elif i == 6:
            varkeys = _l.split()
        elif i == 8:
            units = _l.split()
        elif i == 9:
            crshdr = _l.lower()
        elif i == 10:
            crsparts = np.array(_l.split(), dtype='d')
        elif i == 12:
            projparts = np.array(_l.split(), dtype='d')
            nlay = 1
            vgparts = np.array(projparts[-nlay - 3:], dtype='f')
            gridparts = projparts[:-len(vgparts)]

    dt = pd.to_timedelta('1h')
    points = np.frombuffer(bf.read(ntime * 8), dtype='>i8')
    outfmt = '%Y-%m-%dT%H:%M:%S%z'
    dts = pd.Series(np.concatenate([
        np.ones(points[ti]) * ti * dt
        for ti in range(ntime)
    ]))
    times = (stime + dts).dt.strftime(outfmt).values
    npoints = points.sum()
    lons = np.frombuffer(bf.read(npoints * 8), dtype='>d')
    lats = np.frombuffer(bf.read(npoints * 8), dtype='>d')
    cols = np.frombuffer(bf.read(npoints * 8), dtype='>i8')
    rows = np.frombuffer(bf.read(npoints * 8), dtype='>i8')
    vals = np.frombuffer(bf.read(npoints * 8), dtype='>d')
    valkey = f'{varkeys[0]}({units[0]})'
    gdn = 'unknown'  # gdnam
    gattrs = getgridprops(gdn, crshdr, crsparts, projparts, gridparts, vgparts)
    df = pd.DataFrame({
        'Timestep(UTC)': times.astype(f'={times.dtype.char}'),
        'LONGITUDE(deg)': lons.astype(f'={lons.dtype.char}'),
        'LATITUDE(deg)': lats.astype(f'={lats.dtype.char}'),
        'COLUMN(-)': cols.astype(f'={cols.dtype.char}'),
        'ROW(-)': rows.astype(f'={rows.dtype.char}'),
        valkey: vals.astype(f'={vals.dtype.char}'),
    })
    df.attrs.update(gattrs)
    df.attrs['rsig_program'] = rsig_program
    return df
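

# Usage sketch (hypothetical path): from_regriddedswath returns a DataFrame
# whose attrs carry the grid metadata assembled by getgridprops, keyed as
# 'vertical', 'projection', and 'crs', plus the originating rsig_program.
def _example_regriddedswath_attrs():
    with open('regridded_viirs.xdr', 'rb') as bf:
        df = from_regriddedswath(bf)
    return df.attrs['projection'], df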
""" import numpy as np import pandas as pd # from .utils import get_proj4 headerlines = [] for i in range(19): _l = bf.readline().decode().strip() headerlines.append(_l) if i == 0: assert (_l.lower() == 'regridded-swath 2.0') elif i == 1: rsig_program = _l elif i == 2: stime = pd.to_datetime(_l) elif i == 4: ntime, = np.array(_l.split(), dtype='i') elif i == 6: varkeys = _l.split() elif i == 8: units = _l.split() elif i == 9: crshdr = _l.lower() elif i == 10: crsparts = np.array(_l.split(), dtype='d') elif i == 12: projparts = np.array(_l.split(), dtype='d') nlay = 1 vgparts = np.array(projparts[-nlay - 3:], dtype='f') gridparts = projparts[:-len(vgparts)] dt = pd.to_timedelta('1h') times = [] points = np.frombuffer(bf.read(ntime * 8), dtype='>i8') outfmt = '%Y-%m-%dT%H:%M:%S%z' dts = pd.Series(np.concatenate([ np.ones(points[ti]) * ti * dt for ti in range(ntime) ])) times = (stime + dts).dt.strftime(outfmt).values npoints = points.sum() lons = np.frombuffer(bf.read(npoints * 8), dtype='>d') lats = np.frombuffer(bf.read(npoints * 8), dtype='>d') cols = np.frombuffer(bf.read(npoints * 8), dtype='>i8') rows = np.frombuffer(bf.read(npoints * 8), dtype='>i8') vals = np.frombuffer(bf.read(npoints * 8), dtype='>d') valkey = f'{varkeys[0]}({units[0]})' gdn = 'unknown' # gdnam gattrs = getgridprops(gdn, crshdr, crsparts, projparts, gridparts, vgparts) df = pd.DataFrame({ 'Timestep(UTC)': times.astype(f'={times.dtype.char}'), 'LONGITUDE(deg)': lons.astype(f'={lons.dtype.char}'), 'LATITUDE(deg)': lats.astype(f'={lats.dtype.char}'), 'COLUMN(-)': cols.astype(f'={cols.dtype.char}'), 'ROW(-)': rows.astype(f'={rows.dtype.char}'), valkey: vals.astype(f'={vals.dtype.char}'), }) df.attrs.update(gattrs) df.attrs['rsig_program'] = rsig_program return df if __name__ == '__main__': import pyrsig baseurl = ( 'https://ofmpub.epa.gov/rsig/rsigserver?SERVICE=wcs&VERSION=1.0.0' + '&REQUEST=GetCoverage&FORMAT=xdr' ) baseurl += '&BBOX=-130.0,20.,-65.,60&COMPRESS=1' ckey = 'pandora.L2_rnvs3p1_8.nitrogen_dioxide_vertical_column_amount' url = ( f'{baseurl}&TIME=2023-10-17T13:00:00Z/2023-10-17T13:59:59Z' + f'&COVERAGE={ckey}' ) r = pyrsig.legacy_get(url) print('Profile test') dfprof = from_xdr(r.content, decompress=True) url = ( f'{baseurl}&TIME=2023-10-17T13:00:00Z/2023-10-17T14:59:59Z' + '&COVERAGE=airnow.ozone' ) # Get it and decompress it r = pyrsig.legacy_get(url) print('Site test') dfsite = from_xdr(r.content, decompress=True) tempokey = open('/home/bhenders/.tempokey', 'r').read().strip() ckey = 'tempo.l2.no2.vertical_column_troposphere' url = ( f'{baseurl}&TIME=2023-11-17T13:00:00Z/2023-11-17T13:59:59Z' + '&COVERAGE={ckey}&KEY={tempokey}' ) # Get it and decompress it r = pyrsig.legacy_get(url) print('Swath test') dfswath = from_xdr(r.content, decompress=True)