Source code for pyrsig.xdr

__all__ = ['from_xdrfile', 'from_xdr']


def getgridprops(gdnam, crshdr, crsparts, projparts, gridparts, vgparts):
    import numpy as np
    vgprops = {}
    vgprops['VGTYP'] = np.int32(vgparts[0])
    vgprops['VGTOP'] = np.float32(vgparts[1])
    vgprops['VGLVLS'] = np.array(vgparts[2:], dtype='f')
    gridkeys = ['NCOLS', 'NROWS', 'XORIG', 'YORIG', 'XCELL', 'YCELL']
    gridprops = dict(zip(gridkeys, gridparts))
    projattrs = {'GDNAM': gdnam}
    projattrs.update(gridprops)
    crskeys = crshdr.split(': ')[-1].split()
    crsprops = dict(zip(crskeys, crsparts))
    if 'stereographic' in crshdr:
        projattrs['GDTYP'] = 6
        projattrs['P_ALP'] = crsprops['lat_0'] / 90
        projattrs['XCENT'] = projattrs['P_BET'] = crsprops['lon_0']
        projattrs['YCENT'] = crsprops['lat_0']
        projattrs['P_GAM'] = crsprops['lat_sec']
    elif 'lcc' in crshdr:
        projattrs['GDTYP'] = 2
        projattrs['P_ALP'] = crsprops['lat_1']
        projattrs['P_BET'] = crsprops['lat_2']
        projattrs['P_GAM'] = crsprops['lon_0']
        projattrs['XCENT'] = crsprops['lon_0']
        projattrs['YCENT'] = crsprops['lat_0']
    elif 'lonlat' in crshdr:
        projattrs['GDTYP'] = 1
    else:
        raise KeyError(f'Need to implement {crshdr}')
        # projattrs['GDTYP'] = 7 # merc
        # projattrs['GDTYP'] = 1 # lonlat
    return {'vertical': vgprops, 'projection': projattrs, 'crs': crsprops}
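

# Illustrative sketch (not part of pyrsig): how getgridprops assembles
# IOAPI-like metadata from parsed header pieces. All header values below are
# hypothetical; a header containing 'lcc' maps to GDTYP 2 (Lambert conformal
# conic), per the branch above.
def _example_getgridprops():
    crshdr = (
        '# lcc projection: lat_1 lat_2 lat_0 lon_0'
        ' major_semiaxis minor_semiaxis'
    )
    crsparts = [33., 45., 40., -97., 6.37e6, 6.37e6]
    gridparts = [459., 299., -2556000., -1728000., 12000., 12000.]
    vgparts = [7, 5000., 1., 0.9975, 0.995]
    projparts = list(gridparts) + list(vgparts)
    props = getgridprops(
        '12US1', crshdr, crsparts, projparts, gridparts, vgparts
    )
    assert props['projection']['GDTYP'] == 2  # Lambert conformal conic
    return props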


def from_xdrfile(
    path, na_values=None, decompress=None, as_dataframe=True,
    decompress_inline=True
):
    """
    Currently supports profile, site and swath (v2.0). Each is in XDR format
    with a custom set of header rows in text format. The text header rows
    also describe the binary portion of the file. See from_xdr for the full
    list of supported formats.

    Arguments
    ---------
    path : str
        Path to file in XDR format with RSIG headers
    decompress : bool
        If None, decompress when path ends in .gz.
        If True, decompress.
        If False, do not decompress (was never compressed).
    decompress_inline : bool
        If True (default), use gzip.open to decompress and read the file.
        If False, decompress the file on disk.
    as_dataframe : bool
        If True (default), return data as a pandas.DataFrame.
        If False, return an xarray.Dataset. Only subset and grid support
        as_dataframe.

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content
    """
    import gzip
    import os
    import pandas as pd

    if decompress is None:
        decompress = path.endswith('.gz')
    fsize = os.stat(path).st_size
    if fsize == 20:
        return pd.DataFrame()
    elif fsize < 20:
        raise IOError(
            'File size less than 20-bytes; likely failed to download.'
        )
    if decompress:
        if decompress_inline:
            inf = gzip.open(path)
        else:
            temppath = path.replace('.gz', '')
            assert temppath != path
            if not os.path.exists(temppath):
                with gzip.open(path, 'rb') as inf:
                    with open(temppath, 'wb') as outf:
                        outf.write(inf.read())
            inf = open(temppath, 'rb')
    else:
        inf = open(path, 'rb')
    outf = from_xdr(
        inf, decompress=decompress, na_values=na_values,
        as_dataframe=as_dataframe
    )
    inf.close()
    return outf
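

# Usage sketch (not part of pyrsig): the path and missing-value flag below
# are hypothetical. With a .gz path and the defaults, decompression is
# inferred from the suffix and done in memory via gzip.open;
# decompress_inline=False would instead write a decompressed copy to disk.
def _example_from_xdrfile():
    df = from_xdrfile('airnow_pm25.xdr.gz', na_values=-9999.0)
    return df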


def from_xdr(inf, na_values=None, decompress=False, as_dataframe=True):
    """
    Currently supports profile, site and swath (v2.0). Each is in XDR format
    with a custom set of header rows in text format. The text header rows
    also describe the binary portion of the file.

    Infers the RSIG format using the first 40 characters:

    * Site 2.0: from_site
    * Profile 2.0: from_profile
    * Swath 2.0: from_swath
    * Point 1.0: from_point
    * CALIPSO 1.0: from_calipso
    * Polygon 1.0: from_polygon
    * Grid 1.0: from_grid
    * Subset 9.0: from_subset
    * REGRIDDED-Swath 2.0: from_regriddedswath

    Arguments
    ---------
    inf : file
        Data file in XDR format with RSIG headers
    na_values : scalar
        Used to remove known missing values.
    decompress : bool
        If True, decompress to temporary file.
        If False, do not decompress to temporary file (was never compressed).
    as_dataframe : bool
        If True (default), return data as a pandas.DataFrame.
        If False, return an xarray.Dataset. Only subset and grid support
        as_dataframe.

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content
    """
    import numpy as np

    inf.seek(0, 0)
    defspec = inf.read(40).decode().split('\n')[0].lower().strip()
    inf.seek(0, 0)
    if defspec.startswith('profile'):
        df = from_profile(inf)
    elif defspec.startswith('site'):
        df = from_site(inf)
    elif defspec.startswith('point'):
        df = from_point(inf)
    elif defspec.startswith('swath'):
        df = from_swath(inf)
    elif defspec.startswith('calipso'):
        df = from_calipso(inf)
    elif defspec.startswith('polygon'):
        df = from_polygon(inf)
    elif defspec.startswith('grid'):
        df = from_grid(inf, as_dataframe=as_dataframe)
    elif defspec.startswith('subset'):
        df = from_subset(inf, as_dataframe=as_dataframe)
    elif defspec.startswith('regridded-swath'):
        df = from_regriddedswath(inf)
    else:
        raise IOError(
            f'{defspec} not in supported formats (profile, site, swath,'
            ' calipso, polygon, grid, subset, or regridded-swath)'
        )
    if na_values is not None:
        df.replace(na_values, np.nan, inplace=True)
    return df
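

# Usage sketch (not part of pyrsig): from_xdr dispatches on the first header
# line, so bytes already in memory (e.g., a downloaded response body that
# has already been decompressed) can be wrapped in io.BytesIO. raw is
# assumed to be an uncompressed XDR payload and -9999.0 is a hypothetical
# missing-value flag.
def _example_from_xdr(raw):
    import io
    return from_xdr(io.BytesIO(raw), na_values=-9999.0)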


def from_polygon(bf):
    """
    Currently supports Polygon (v1.0) which has 10 header rows in text
    format. The text header rows also describe the binary portion of the
    file, which includes three segments of data corresponding to shx, shp,
    and dbf files embedded within the file.

    Arguments
    ---------
    bf : file
        File opened in byte read in XDR format with RSIG headers

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content

    Notes
    -----
    xdrdump master_hms_smoke_2018-07-01_to_2018-07-02.xdr|m
    Polygon 1.0
    https://www.ospo.noaa.gov/Products/land/hms.html,hmsserver
    2018-07-01T00:00:00-0000 2018-07-02T23:59:59-0000
    # Variable names:
    smoke
    # Variable units:
    ug/m3
    # name shxdatabytes shpdatabytes dbfdatabytes and
    # shxfiledata shpfiledata dbffiledata:
    hms_smoke 828 30812 2468
    """
    import tempfile
    import geopandas as gpd

    for i in range(10):
        _l = bf.readline().decode().strip()
        if i == 0:
            assert (_l.lower() == 'polygon 1.0')
        if i == 2:
            dates, datee = _l.replace(':', '').replace('-0000', 'Z').split()
        elif i == 9:
            prefix, shxn, shpn, dbfn = _l.split()
            shxn = int(shxn)
            shpn = int(shpn)
            dbfn = int(dbfn)

    shxs = bf.tell()
    shxe = shxs + shxn
    shps = shxe
    shpe = shps + shpn
    dbfs = shpe
    dbfe = dbfs + dbfn
    with tempfile.TemporaryDirectory() as td:
        filestem = f'{td}/{prefix}_{dates}_{datee}'
        bf.seek(shxs, 0)
        with open(filestem + '.shx', 'wb') as shxf:
            shxf.write(bf.read(shxe - shxs))
        with open(filestem + '.shp', 'wb') as shpf:
            shpf.write(bf.read(shpe - shps))
        with open(filestem + '.dbf', 'wb') as dbff:
            dbff.write(bf.read(dbfe - dbfs))
        outdf = gpd.read_file(filestem + '.shp')

    outdf.crs = 4326
    return outdf


def from_profile(bf):
    """
    Currently supports Profile (v2.0) which has 14 header rows in text
    format. The text header rows also describe the binary portion of the
    file.

    Arguments
    ---------
    bf : file
        File opened in byte read in XDR format with RSIG headers

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content

    Notes
    -----
    # Line 1: definition spec
    # Line 2: Info
    # Line 3: Start/End date
    # Line 4-5: bbox
    # Line 6-7: Dimensions
    # Line 8-9: Variable names
    # Line 10-11: Units
    # Line 12-14: definition of data chunks lines
    # Next nprof * 80 characters are notes for each profile
    # Next nprof * 8 are N_p number of points for each profile as long int
    # Next N * sum(N_p for p in nprof) * 8 are data as 64-bit reals

    Example Header
    --------------
    Profile 2.0
    http://www.esrl.noaa.gov/gmd/grad/neubrew/,NEUBrewSubset
    2010-07-10T00:00:00-0000 2010-07-10T23:59:59-0000
    # Subset domain: <min_lon> <min_lat> <max_lon> <max_lat>:
    -126 24 -66 50
    # Dimensions: variables profiles:
    6 5
    # Variable names:
    timestamp id longitude latitude elevation ozone
    # Variable units:
    yyyymmddhhmmss - deg deg m molecules/cm3
    # char notes[profiles][80] and
    # MSB 64-bit integers points[profiles] and
    # IEEE-754 64-bit reals data_1[variables][points_1] ...
    data_P[variables][points_P]:
    """
    import numpy as np
    import pandas as pd

    for i in range(14):
        _l = bf.readline()
        if i == 0:
            assert (_l.decode().strip().lower() == 'profile 2.0')
        elif i == 6:
            nvar, nprof = np.array(_l.decode().strip().split(), dtype='i')
        elif i == 8:
            varkeys = _l.decode().strip().split()
        elif i == 10:
            units = _l.decode().strip().split()

    notes = [bf.read(80).decode().strip() for i in range(nprof)]
    longnp = np.frombuffer(bf.read(nprof * 8), dtype='>i8')
    # Read all values at once
    dfs = []
    varwunits = [varkey + f'({units[i]})' for i, varkey in enumerate(varkeys)]
    for i, npoints in enumerate(longnp):
        nfloats = npoints * nvar
        vals = np.frombuffer(bf.read(nfloats * 8), dtype='>d').reshape(
            nvar, npoints
        )
        vals = vals.byteswap().view(vals.dtype.newbyteorder())
        df = pd.DataFrame(dict(zip(varwunits, vals)))
        df['NOTE'] = notes[i]
        dfs.append(df)

    df = pd.concat(dfs)
    infmt = '%Y%m%d%H%M%S'
    outfmt = '%Y-%m-%dT%H:%M:%S%z'
    ntimestamps = df['timestamp(yyyymmddhhmmss)']
    timestamps = pd.to_datetime(
        ntimestamps, format=infmt, utc=True
    ).dt.strftime(outfmt)
    df.drop('timestamp(yyyymmddhhmmss)', axis=1, inplace=True)
    df['Timestamp(UTC)'] = timestamps
    renames = {
        'id(-)': 'STATION(-)', 'longitude(deg)': 'LONGITUDE(deg)',
        'latitude(deg)': 'LATITUDE(deg)', 'elevation(m)': 'ELEVATION(m)'
    }
    renames = {k: v for k, v in renames.items() if k in df.columns}
    df.rename(columns=renames, inplace=True)
    frontkeys = [
        'Timestamp(UTC)', 'LONGITUDE(deg)', 'LATITUDE(deg)', 'ELEVATION(m)',
        'STATION(-)'
    ]
    lastkeys = ['NOTE']
    outkeys = frontkeys + [
        k for k in df.columns if k not in (frontkeys + lastkeys)
    ] + lastkeys
    df = df[outkeys]
    return df
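

# Sketch of the decoding pattern shared by these readers: XDR payloads are
# big-endian, so numpy.frombuffer reads them with '>i8'/'>d' dtypes, and
# byteswap().view(...) converts the result to native byte order for pandas.
# The payload here is synthetic.
def _example_bigendian_decode():
    import numpy as np
    payload = np.arange(6, dtype='>d').tobytes()  # stand-in for bf.read()
    vals = np.frombuffer(payload, dtype='>d').reshape(2, 3)
    vals = vals.byteswap().view(vals.dtype.newbyteorder())
    assert np.allclose(vals.ravel(), np.arange(6))
    return vals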


def from_swath(bf):
    """
    Currently supports Swath (v2.0) which has 14 header rows in text
    format. The text header rows also describe the binary portion of the
    file.

    Arguments
    ---------
    bf : file
        File opened in byte read in XDR format with RSIG headers

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content

    Notes
    -----
    # 14 header rows to be read using \n
    # Line 1: definition spec
    # Line 2: Info
    # Line 3: Start/End date
    # Line 4-5: Dimensions
    # Line 6-7: Variable names
    # Line 8-9: Units
    # Line 10-11: bbox
    # Line 12-14: definition of data chunks lines
    # Next nscan * 8 bytes are timestamps (yyyydddhhmm) as MSB 64-bit
    #   integers
    # Next nscan * 8 bytes are points per scan (N_s) as MSB 64-bit integers
    # Next nvar * sum(N_s for s in nscan) * 8 bytes are data as 64-bit reals

    Example Header
    --------------
    Swath 2.0
    http://www.star.nesdis.noaa.gov/smcd/emb/viirs_aerosol/,VIIRSSubset
    2013-06-15T00:00:00-0000
    # Dimensions: variables timesteps scans:
    12 24 4
    # Variable names:
    Longitude Latitude Scan_Start_Time faot550 Longitude_SW Longitude_SE ...
    Longitude_NW Longitude_NE Latitude_SW Latitude_SE Latitude_NW ...
    Latitude_NE
    # Variable units:
    deg deg YYYYDDDHHMM - deg deg deg deg deg deg deg deg
    # Domain: <min_lon> <min_lat> <max_lon> <max_lat>
    -113 26 -111 26.5
    # MSB 64-bit integers (yyyydddhhmm) timestamps[scans] and
    # MSB 64-bit integers points[scans] and
    # IEEE-754 64-bit reals data_1[variables][points_1] ...
    data_S[variables][points_S]:
    """
    import numpy as np
    import pandas as pd

    for i in range(14):
        _l = bf.readline()
        if i == 0:
            defspec = _l.decode().strip().lower()
        elif i == 2:
            pass  # stime = _l.decode().strip()
        elif i == 4:
            nvar, nt, nscan = np.array(_l.decode().strip().split(), dtype='i')
        elif i == 6:
            varkeys = _l.decode().strip().split()
        elif i == 8:
            units = _l.decode().strip().split()

    assert (defspec == 'swath 2.0')
    # MSB 64-bit integers (yyyydddhhmm) timestamps[scans]
    nts = np.frombuffer(bf.read(nscan * 8), dtype='>i8')
    # MSB 64-bit integers points[scans]
    nps = np.frombuffer(bf.read(nscan * 8), dtype='>i8')
    infmt = '%Y%j%H%M'
    outfmt = '%Y-%m-%dT%H:%M:%S%z'
    timestamps = pd.to_datetime(nts, format=infmt, utc=True).strftime(outfmt)
    # Read all values at once
    # IEEE-754 64-bit reals data_1[variables][points_1] ...
    # data_S[variables][points_S]
    dfs = []
    varwunits = [varkey + f'({units[i]})' for i, varkey in enumerate(varkeys)]
    # To-do
    # Switch from iterative unpacking to numpy.frombuffer, which is much
    # faster; this requires some fancy indexing and complex repeats.
    for i, npoints in enumerate(nps):
        nfloats = npoints * nvar
        vals = np.frombuffer(bf.read(nfloats * 8), dtype='>d').reshape(
            nvar, npoints
        )
        vals = vals.byteswap().view(vals.dtype.newbyteorder())
        df = pd.DataFrame(dict(zip(varwunits, vals)))
        df['Timestamp(UTC)'] = timestamps[i]
        dfs.append(df)

    df = pd.concat(dfs)
    renames = {
        'Longitude(deg)': 'LONGITUDE(deg)',
        'Latitude(deg)': 'LATITUDE(deg)',
    }
    renames = {k: v for k, v in renames.items() if k in df.columns}
    df.rename(columns=renames, inplace=True)
    outkeys = ['Timestamp(UTC)', 'LONGITUDE(deg)', 'LATITUDE(deg)']
    outkeys = outkeys + [k for k in df if k not in outkeys]
    df = df[outkeys]
    return df
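

# Sketch (synthetic input): swath timestamps are MSB 64-bit integers in
# yyyydddhhmm form (year, day-of-year, hour, minute), which pandas parses
# with the '%Y%j%H%M' format used above. 20131662000 is day 166 of 2013,
# i.e. 2013-06-15 20:00 UTC.
def _example_swath_timestamp():
    import numpy as np
    import pandas as pd
    nts = np.array([20131662000])
    ts = pd.to_datetime(nts, format='%Y%j%H%M', utc=True)
    return ts.strftime('%Y-%m-%dT%H:%M:%S%z')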


def from_site(bf):
    """
    Currently supports Site (v2.0) which has 13 header rows in text
    format. The text header rows also describe the binary portion of the
    file.

    Arguments
    ---------
    bf : file
        File opened in byte read in XDR format with RSIG headers

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content

    Notes
    -----
    # 13 header rows to be read using \n
    # Line 1: definition spec
    # Line 2: Info
    # Line 3: Start/End date
    # Line 4-5: Dimensions
    # Line 6-7: Variable names
    # Line 8-9: Units
    # Line 10-13: definition of data chunks lines
    # Next nsite * 80 characters are notes for each site
    # Next nsite * 4 (or * 8) bytes are site IDs as 32-bit or 64-bit
    #   integers
    # Next nsite * 2 * 4 (or * 8) bytes are <longitude,latitude> pairs as
    #   32-bit or 64-bit reals
    # Next nt * nsite * 4 (or * 8) bytes are data as 32-bit or 64-bit reals

    Example Header
    --------------
    SITE 2.0
    http://airnow.gov/,reformat_airnow_obs,SiteSubset
    2005-08-26T00:00:00-0000
    # data dimensions: timesteps stations
    3 9
    # Variable names:
    PM25
    # Variable units:
    ug/m3
    # char notes[stations][80] and
    # MSB 64-bit integers ids[stations] and
    # IEEE-754 64-bit reals sites[stations][2=<longitude,latitude>] and
    # IEEE-754 64-bit reals data[timesteps][stations]:
    """
    import numpy as np
    import pandas as pd

    for i in range(13):
        _l = bf.readline()
        if i == 0:
            assert (_l.decode().strip().lower() == 'site 2.0')
        elif i == 2:
            stime = _l.decode().strip()
        elif i == 4:
            nt, nsite = np.array(_l.decode().strip().split(), dtype='i')
        elif i == 6:
            varkeys = _l.decode().strip().split()
        elif i == 8:
            units = _l.decode().strip().split()
        elif i == 10:
            idtype = _l.decode().strip()
        elif i == 11:
            xytype = _l.decode().strip()
        elif i == 12:
            valtype = _l.decode().strip()

    idtype = np.dtype({True: '>i8', False: '>i4'}['64' in idtype])
    xytype = np.dtype({True: '>f8', False: '>f4'}['64' in xytype])
    valtype = np.dtype({True: '>f8', False: '>f4'}['64' in valtype])
    time = pd.date_range(stime, periods=nt, freq='h')
    notes = [bf.read(80).decode().strip() for i in range(nsite)]
    buff = bf.read(nsite * idtype.itemsize)
    nis = np.frombuffer(buff, dtype=idtype)
    buff = bf.read(nsite * xytype.itemsize * 2)
    xys = np.frombuffer(buff, dtype=xytype).reshape(nsite, 2)
    # Read all values at once
    varwunits = [varkey + f'({units[i]})' for i, varkey in enumerate(varkeys)]
    # Unpacking data using frombuffer, which is much faster than iteratively
    # calling xdr.unpack_farray
    vals = np.frombuffer(bf.read(), dtype=valtype).reshape(1, nsite, nt)
    # Expand per-site/time coordinates to time-major order so they align
    # with the raveled data block (file layout is data[timesteps][stations])
    atimes = np.array(time, ndmin=1)[None, :].repeat(nsite, 0).T.ravel()
    anotes = np.array(notes)[:, None].repeat(nt, 1).T.ravel()
    astation = nis[:, None].repeat(nt, 1).T.ravel()
    axs = xys[:, 0, None].repeat(nt, 1).T.ravel()
    ays = xys[:, 1, None].repeat(nt, 1).T.ravel()
    data = {
        'Timestamp(UTC)': atimes,
        'SITE_NAME': anotes,
        'STATION(-)': astation.astype(f'={astation.dtype.char}'),
        'LONGITUDE(deg)': axs.astype(f'={axs.dtype.char}'),
        'LATITUDE(deg)': ays.astype(f'={ays.dtype.char}'),
    }
    for vi, varwunit in enumerate(varwunits):
        data[varwunit] = vals[vi].ravel().astype(f'={vals.dtype.char}')

    df = pd.DataFrame(data)
    # Serves as a comment on how this was done iteratively.
    """
    dfs = []
    for i, id in enumerate(nis):
        lon, lat = xys[i]
        vals = np.array(up.unpack_farray(nt, up.unpack_float)).reshape(1, nt)
        # assert np.allclose(valchk[0, i], vals)
        df = pd.DataFrame(dict(zip(varwunits, vals)))
        df['Timestamp(UTC)'] = time
        df['SITE_NAME'] = notes[i]
        df['STATION(-)'] = id
        df['LONGITUDE(deg)'] = lon
        df['LATITUDE(deg)'] = lat
        dfs.append(df)

    df = pd.concat(dfs)
    """
    frontkeys = [
        'Timestamp(UTC)', 'LONGITUDE(deg)', 'LATITUDE(deg)', 'STATION(-)'
    ]
    lastkeys = ['SITE_NAME']
    outkeys = frontkeys + [
        k for k in df.columns if k not in (frontkeys + lastkeys)
    ] + lastkeys
    df = df[outkeys]
    return df
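

# Sketch (synthetic sizes): the repeat/ravel pattern above expands per-site
# metadata against the time axis so every (time, site) pair gets a row, in
# time-major order to match the raveled data block.
def _example_site_expand():
    import numpy as np
    nt, nsite = 4, 3
    ids = np.array([101, 102, 103])
    astation = ids[:, None].repeat(nt, 1).T.ravel()
    # time-major: all sites at t0, then all sites at t1, ...
    assert (astation[:nsite] == ids).all()
    return astation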


def from_point(bf):
    """
    Currently supports Point (v1.0) which has 11 header rows in text
    format. The text header rows also describe the binary portion of the
    file.

    Arguments
    ---------
    bf : file
        File opened in byte read in XDR format with RSIG headers

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content

    Notes
    -----
    # 11 header rows to be read using \n
    # Line 1: definition spec
    # Line 2: Info
    # Line 3: Start/End date
    # Line 4-5: Dimensions
    # Line 6-7: Variable names
    # Line 8-9: Units
    # Line 10-11: definition of data chunks lines
    # Next npoint * 80 characters are notes for each point
    # Next nvariable * npoint * 8 bytes are data[variables][points]

    Example Header
    --------------
    Point 1.0
    https://satepsanone.nesdis.noaa.gov/pub/FIRE/BBEP-geo,GOESBBSubset
    2018-11-09T00:00:00-0000 2018-11-10T23:00:00-0000
    # Dimensions: variables points:
    4 1876
    # Variable names:
    Timestamp Longitude Latitude PM25_emission
    # Variable units:
    yyyymmddhhmmss deg deg kg
    # char notes[points][80] and
    # IEEE-754 64-bit reals data[variables][points]:
    """
    import numpy as np
    import pandas as pd

    for i in range(11):
        _l = bf.readline()
        if i == 0:
            assert (_l.decode().strip().lower() == 'point 1.0')
        elif i == 2:
            # stime = _l.decode().strip()
            pass  # not loading time because it is duplicative
        elif i == 4:
            nvar, npoint = np.array(_l.decode().strip().split(), dtype='i')
        elif i == 6:
            varkeys = _l.decode().strip().split()
        elif i == 8:
            units = _l.decode().strip().split()

    notes = [bf.read(80).decode().strip() for i in range(npoint)]
    # Use numpy frombuffer because it is faster than iterative unpacking
    vals = np.frombuffer(bf.read(), dtype='>d').reshape(nvar, npoint)
    vals = vals.byteswap().view(vals.dtype.newbyteorder())
    varkeys = [
        {'id': 'STATION'}.get(varkey, varkey).upper() for varkey in varkeys
    ]
    varwunits = [varkey + f'({units[i]})' for i, varkey in enumerate(varkeys)]
    data = {
        'NOTE': notes,
    }
    for vi, varwunit in enumerate(varwunits):
        data[varwunit] = vals[vi].ravel().astype(f'={vals.dtype.char}')

    df = pd.DataFrame(data)
    infmt = '%Y%m%d%H%M%S'
    outfmt = '%Y-%m-%dT%H:%M:%S%z'
    ntimestamps = df['TIMESTAMP(yyyymmddhhmmss)']
    timestamps = pd.to_datetime(
        ntimestamps, format=infmt, utc=True
    ).dt.strftime(outfmt)
    df.drop('TIMESTAMP(yyyymmddhhmmss)', axis=1, inplace=True)
    ti = varwunits.index('TIMESTAMP(yyyymmddhhmmss)')
    varwunits[ti] = 'Timestamp(UTC)'
    df['Timestamp(UTC)'] = timestamps
    outkeys = varwunits + ['NOTE']
    df = df[outkeys].rename(
        columns={'PM25_ATM_HOURLY(ug/m3)': 'pm25_atm_hourly(ug/m3)'}
    )
    return df


def from_calipso(bf):
    """
    Currently supports Calipso (v1.0) which has 15 header rows in text
    format. The text header rows also describe the binary portion of the
    file.

    Arguments
    ---------
    bf : file
        File opened in byte read in XDR format with RSIG headers

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content

    Example Header
    --------------
    CALIPSO 1.0
    https://eosweb.larc.nasa.gov/project/calipso/calipso_table,CALIPSOSubset
    2016-05-04T00:00:00-0000
    # Dimensions: variables timesteps profiles:
    5 24 8
    # Variable names: (VariableName is Total_Backscatter_Coefficient_532)
    Profile_UTC_Time Longitude Latitude Elevation VariableName
    # Variable units:
    yyyymmdd.f deg deg m per_kilometer_per_steradian
    # Domain: <min_lon> <min_lat> <max_lon> <max_lat>
    -126 24 -66 50
    # MSB 64-bit integers (yyyydddhhmm) profile_timestamps[profiles] and
    # IEEE-754 64-bit reals profile_bounds[profiles][2=<lon,lat>][2=<min,max>]
    # MSB 64-bit integers profile_dimensions[profiles][2=<points,levels>] and
    # IEEE-754 64-bit reals profile_data_1[variables][points_1][levels]
    # ... profile_data_S[variables][points_S][levels]:
    """
    import numpy as np
    import pandas as pd
    import xarray as xr

    headerlines = []
    for i in range(15):
        _l = bf.readline().decode()
        headerlines.append(_l)
        if i == 0:
            assert (_l.strip().lower() == 'calipso 1.0')
        elif i == 2:
            # stime = _l.strip()
            pass  # not loading time because it is duplicative
        elif i == 4:
            nvar, ntime, nprof = np.array(_l.strip().split(), dtype='i')
        elif i == 6:
            varkeys = _l.strip().split()
        elif i == 8:
            units = _l.strip().split()
        elif i == 10:
            # bbox = [float(d) for d in _l.strip().split()]
            pass  # bbox not loaded; not needed

    stime = bf.tell()
    etime = stime + nprof * 8
    sbnd = etime
    ebnd = sbnd + (nprof * 4) * 8
    sdims = ebnd
    edims = sdims + (nprof * 2) * 8
    sdata = edims
    # time = np.frombuffer(bf.read(etime - stime), dtype='>i8')
    # bshape = [nprof, 2, 2]
    # tbnds = np.frombuffer(bf.read(ebnd - sbnd), dtype='>d').reshape(*bshape)
    bf.seek(sdims, 0)
    dims = np.frombuffer(bf.read(edims - sdims), dtype='>i8').reshape(nprof, 2)
    if not (dims[:, 1] == dims[0, 1]).all():
        raise IOError(
            'CALIPSO 1.0 xdr reader needs to be updated; levels vary. Use'
            + ' ASCII backend instead of xdr until resolved. Please report to'
            + ' https://github.com/barronh/pyrsig/issues'
        )
    times = []
    lons = []
    lats = []
    elev = []
    data = []
    for iprof, (npoint, nlev) in enumerate(dims):
        edata = sdata + (3 * npoint + 2 * npoint * nlev) * 8
        vals = np.frombuffer(bf.read(edata - sdata), dtype='>d')
        vals = vals.byteswap().view(vals.dtype.newbyteorder())
        sd = 0
        ed = sd + npoint * 3
        ptimes, plons, plats = vals[sd:ed].reshape(3, npoint)
        sd = ed
        ed = sd + npoint * nlev * 2
        pelev, pdata = vals[sd:ed].reshape(2, npoint, nlev)
        times.append(ptimes)
        lons.append(plons)
        lats.append(plats)
        elev.append(pelev)
        data.append(pdata)
        sdata = edata

    ds = xr.Dataset()
    alldata = [times, lons, lats, elev, data]
    for key, vals, unit in zip(varkeys, alldata, units):
        attrs = dict(long_name=key, units=unit)
        dims = {1: ('points',), 2: ('points', 'levels')}[vals[0].ndim]
        ds[key] = dims, np.concatenate(vals, axis=0), attrs

    ds.attrs['description'] = '\n'.join(headerlines)
    df = ds.to_dataframe().reset_index(drop=True)
    renamer = {key: f'{key}({unit})' for key, unit in zip(varkeys, units)}
    renamer['Latitude'] = 'LATITUDE(deg)'
    renamer['Longitude'] = 'LONGITUDE(deg)'
    renamer['Elevation'] = 'ELEVATION(m)'
    # yyyymmdd.f: the integer part is the date and the fractional part is
    # the fraction of a day; parsed with an explicit format because numeric
    # input would otherwise be treated as nanoseconds since the epoch
    df['Timestamp(UTC)'] = (
        pd.to_datetime(
            (df['Profile_UTC_Time'] // 1).astype('i8'), format='%Y%m%d',
            utc=True
        )
        + pd.to_timedelta(df['Profile_UTC_Time'] % 1, unit='d')
    ).dt.round('1s')
    df = df[['Timestamp(UTC)'] + varkeys[1:-1] + varkeys[:1] + varkeys[-1:]]
    df.rename(columns=renamer, inplace=True)
    return df
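

# Sketch (synthetic buffer): the offset bookkeeping in from_calipso computes
# the byte range of each fixed-size segment up front, then seeks past the
# segments it does not need (timestamps and bounds) straight to the
# per-profile dimensions.
def _example_calipso_offsets():
    import io
    import numpy as np
    nprof = 2
    dims = np.array([[10, 5], [12, 5]], dtype='>i8')  # points, levels
    pad = b'\x00' * (nprof * 8 + nprof * 4 * 8)  # timestamps + bounds
    bf = io.BytesIO(pad + dims.tobytes())
    sdims = nprof * 8 + nprof * 4 * 8
    bf.seek(sdims, 0)
    out = np.frombuffer(bf.read(nprof * 2 * 8), dtype='>i8')
    assert (out.reshape(nprof, 2) == dims).all()
    return out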


def from_grid(bf, as_dataframe=True):
    """
    Currently supports Grid (v1.0) which has 12 header rows in text format.
    The text header rows also describe the binary portion of the file.

    Arguments
    ---------
    bf : file
        File opened in byte read in XDR format with RSIG headers
    as_dataframe : bool
        If True (default), return data as a pandas.DataFrame.
        If False, return an xarray.Dataset.

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content

    Example Header
    --------------
    xdrdump master_hrrr_wind_2020-02-17.xdr
    Grid 1.0
    http://home.chpc.utah.edu/~u0553130/Brian_Blaylock/,HRRRSubset
    2020-02-17T00:00:00-0000
    # Dimensions: timesteps variables rows columns:
    24 2 85 73
    # Variable names:
    wind_u wind_v
    # Variable units:
    m/s m/s
    # IEEE-754 64-bit reals longitudes[rows][columns] and
    # IEEE-754 64-bit reals latitudes[rows][columns] and
    # IEEE-754 64-bit reals data[timesteps][variables][rows][columns]:
    -7.8518169999999998e+01 -7.8484610000000004e+01 -7.8451070000000001e+01
    ...
    """
    import numpy as np
    import pandas as pd
    import xarray as xr

    headerlines = []
    for i in range(12):
        _l = bf.readline().decode().strip()
        headerlines.append(_l)
        if i == 0:
            assert (_l.lower() == 'grid 1.0')
        elif i == 1:
            rsig_program = _l
        elif i == 2:
            stime = pd.to_datetime(_l)
        elif i == 4:
            ntime, nvar, nrow, ncol = np.array(_l.split(), dtype='i')
        elif i == 6:
            varkeys = _l.split()
        elif i == 8:
            units = _l.split()
        elif i == 10:
            pass  # data definition lines not needed

    nbytes = nrow * ncol * 8
    lon = np.frombuffer(bf.read(nbytes), dtype='>d').reshape(nrow, ncol)
    lon = lon.byteswap().view(lon.dtype.newbyteorder())
    lat = np.frombuffer(bf.read(nbytes), dtype='>d').reshape(nrow, ncol)
    lat = lat.byteswap().view(lat.dtype.newbyteorder())
    dshape = ntime, nvar, nrow, ncol
    nbytes = np.prod(dshape) * 8
    data = np.frombuffer(bf.read(nbytes), dtype='>d').reshape(*dshape)
    data = data.byteswap().view(data.dtype.newbyteorder())
    ds = xr.Dataset()
    ds.attrs['rsig_program'] = rsig_program
    dt = pd.to_timedelta('3600s')
    reftime = '1970-01-01T00:00:00+0000'
    attrs = dict(units=f'seconds since {reftime}', long_name='time')
    ds.coords['time'] = ('time',), stime + np.arange(ntime) * dt, attrs
    attrs = dict(units='index', long_name='x_center')
    ds.coords['x'] = ('x',), np.arange(ncol, dtype='d') + 0.5, attrs
    attrs = dict(units='index', long_name='y_center')
    ds.coords['y'] = ('y',), np.arange(nrow, dtype='d') + 0.5, attrs
    attrs = dict(units='degrees_north', long_name='latitude')
    ds['lat'] = ('y', 'x'), lat, attrs
    attrs = dict(units='degrees_east', long_name='longitude')
    ds['lon'] = ('y', 'x'), lon, attrs
    for vi, varkey in enumerate(varkeys):
        attrs = dict(long_name=varkey, units=units[vi])
        ds[varkey] = ('time', 'y', 'x'), data[:, vi, :, :], attrs

    if as_dataframe:
        df = ds.to_dataframe().astype('d')
        time = df.index.get_level_values('time')
        df['Timestamp(UTC)'] = time.strftime('%Y-%m-%dT%H:%M:%S+0000')
        keepcols = ['Timestamp(UTC)', 'lon', 'lat'] + varkeys
        renamer = {vk: f'{vk}({vu})' for vk, vu in zip(varkeys, units)}
        renamer['lon'] = 'LONGITUDE(deg)'
        renamer['lat'] = 'LATITUDE(deg)'
        df = df[keepcols].rename(columns=renamer)
        return df
    else:
        return ds
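

# Usage sketch (hypothetical path): with as_dataframe=False, from_grid
# returns an xarray.Dataset with time/y/x coordinates and 2-D lon/lat, so
# grid variables (wind_u in the example header above) keep their shape.
def _example_grid_dataset():
    with open('hrrr_wind.xdr', 'rb') as bf:
        ds = from_grid(bf, as_dataframe=False)
    return ds['wind_u'].mean('time')  # per-cell time average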


def from_subset(bf, as_dataframe=True):
    """
    Currently supports Subset (v9.0) which has 17 header rows in text
    format. The text header rows also describe the binary portion of the
    file.

    This data can be very large for high resolution continental data. For
    example, CMAQ EQUATES CONUS for one 4D variable requires 4GB of memory
    as a dataframe whereas the same data from CMAQ EQUATES HEMI is just
    0.065GB. Both are much smaller as a Dataset where coordinate data is
    not repeated.

    Arguments
    ---------
    bf : file
        File opened in byte read in XDR format with RSIG headers
    as_dataframe : bool
        If True (default), return data as a pandas.DataFrame.
        If False, return an xarray.Dataset.

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content

    Notes
    -----
    xdrdump master_cmaq_amad_hemi_ext_2006-08-28.xdr |m
    SUBSET 9.0 CMAQ
    108NHEMI1
    http://www.epa.gov/ttn/scram/,CMAQSubset
    2006-08-28T00:00:00-0000
    # data dimensions: timesteps variables layers rows columns:
    1 4 35 187 187
    # subset indices (0-based time, 1-based layer/row/column): first-timestep ...
    last-timestep first-layer last-layer first-row last-row first-column ...
    last-column:
    0 0 1 35 1 187 1 187
    # Variable names:
    LONGITUDE LATITUDE ELEVATION EXT_Recon
    # Variable units:
    deg deg m 1/km
    # stereographic projection: lat_0 lon_0 lat_sec major_semiaxis ...
    minor_semiaxis
    90 -98 45 6.37e+06 6.37e+06
    # Grid: ncols nrows xorig yorig xcell ycell vgtyp vgtop vglvls[36]:
    187 187 -1.0098e+07 -1.0098e+07 108000 108000 7 5000 1 0.9975 0.995 ...
    # IEEE-754 32-bit reals data[variables][timesteps][layers][rows][columns]:
    -1.4300000000000000e+02 -1.4269029235839844e+02 ...
    """
    import numpy as np
    import pandas as pd
    import xarray as xr
    from .utils import get_proj4
    import pyproj

    headerlines = []
    for i in range(17):
        _l = bf.readline().decode().strip()
        headerlines.append(_l)
        if i == 0:
            assert (_l.lower() == 'subset 9.0 cmaq')
        elif i == 1:
            gdnam = _l
        elif i == 2:
            rsig_program = _l
        elif i == 3:
            stime = pd.to_datetime(_l)
        elif i == 5:
            ntime, nvar, nlay, nrow, ncol = np.array(_l.split(), dtype='i')
        elif i == 7:
            ftime, ltime, flay, llay, frow, lrow, fcol, lcol = np.array(
                _l.split(), dtype='i'
            )
        elif i == 9:
            varkeys = _l.split()
        elif i == 11:
            units = _l.split()
        elif i == 12:
            crshdr = _l.lower()
        elif i == 13:
            crsparts = np.array(_l.split(), dtype='d')
        elif i == 15:
            projparts = np.array(_l.split(), dtype='d')
            vgparts = np.array(projparts[-nlay - 3:], dtype='f')
            gridparts = projparts[:-len(vgparts)]
        elif i == 16:
            is64 = '64-bit' in _l
            if is64:
                dtval = '>d'
            else:
                dtval = '>f'

    gridprops = getgridprops(
        gdnam, crshdr, crsparts, projparts, gridparts, vgparts
    )
    vgprops = gridprops['vertical']
    vglvls = vgprops['VGLVLS']
    projattrs = gridprops['projection']
    earth_radius = gridprops['crs']['major_semiaxis']
    proj4 = get_proj4(projattrs, earth_radius=earth_radius)
    proj = pyproj.Proj(proj4)
    data = np.frombuffer(bf.read(), dtype=dtval).reshape(
        nvar, ntime, nlay, nrow, ncol
    )
    ds = xr.Dataset()
    ds.attrs.update(**projattrs)
    ds.attrs.update(**vgprops)
    ds.attrs['UPNAM'] = rsig_program.ljust(16)
    dt = pd.to_timedelta('3600s')
    t = stime + np.arange(ntime) * dt
    reftime = '1970-01-01T00:00:00+0000'
    attrs = dict(long_name='TSTEP', units=f'seconds since {reftime}')
    ds.coords['TSTEP'] = t
    z = ((vglvls[1:] + vglvls[:-1]) / 2)[flay - 1:llay]
    attrs = dict(long_name='LAY', units='layer_center')
    ds.coords['LAY'] = ('LAY',), z, attrs
    y = np.arange(nrow) + frow - 0.5
    attrs = dict(long_name='ROW', units='cell_center')
    ds.coords['ROW'] = ('ROW',), y, attrs
    x = np.arange(ncol) + fcol - 0.5
    attrs = dict(long_name='COL', units='cell_center')
    ds.coords['COL'] = ('COL',), x, attrs
    Y, X = xr.broadcast(ds['ROW'], ds['COL'])
    LON, LAT = proj(X.values, Y.values, inverse=True)
    attrs = dict(long_name='LONGITUDE'.ljust(16), units='degrees_east ')
    attrs['var_desc'] = attrs['long_name'].ljust(80)
    ds['LONGITUDE'] = ('ROW', 'COL'), LON.astype('f'), attrs
    attrs = dict(long_name='LATITUDE'.ljust(16), units='degrees_north ')
    attrs['var_desc'] = attrs['long_name'].ljust(80)
    ds['LATITUDE'] = ('ROW', 'COL'), LAT.astype('f'), attrs
    dims = ('TSTEP', 'LAY', 'ROW', 'COL')
    for vi, vk in enumerate(varkeys):
        attrs = dict(
            units=units[vi].ljust(16), long_name=vk.ljust(16),
            var_desc=vk.ljust(80)
        )
        vals = data[vi]
        ds[vk] = dims, vals, attrs

    if as_dataframe:
        df = ds.astype(dtval[1:]).to_dataframe()
        time = df.index.get_level_values('TSTEP')
        df['Timestamp(UTC)'] = time.strftime('%Y-%m-%dT%H:%M:%S+0000')
        keepcols = ['Timestamp(UTC)', 'LONGITUDE', 'LATITUDE'] + varkeys
        renamer = {vk: f'{vk}({vu})' for vk, vu in zip(varkeys, units)}
        renamer['LONGITUDE'] = 'LONGITUDE(deg)'
        renamer['LATITUDE'] = 'LATITUDE(deg)'
        df = df[keepcols].rename(columns=renamer)
        return df
    else:
        return ds
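

# Usage sketch (hypothetical path): for large CMAQ subsets, prefer
# as_dataframe=False; the Dataset form keeps TSTEP/LAY/ROW/COL dimensions
# and does not repeat coordinate values on every row.
def _example_subset_dataset():
    with open('cmaq_ext.xdr', 'rb') as bf:
        ds = from_subset(bf, as_dataframe=False)
    return ds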


def from_regriddedswath(bf):
    """
    Currently supports regridded-swath (v2.0) which has 19 header rows in
    text format. The text header rows also describe the binary portion of
    the file.

    Arguments
    ---------
    bf : file
        File opened in byte read in XDR format with RSIG headers

    Returns
    -------
    df : pd.DataFrame
        Dataframe with XDR content

    Notes
    -----
    In progress.

    xdrdump master_regridded_viirs_ivaot_faot550_2013-06-15.xdr|m
    REGRIDDED-Swath 2.0
    http://....nesdis.noaa.gov/smcd/emb/viirs_aerosol/,VIIRSSubset,XDRConvert
    2013-06-15T20:00:00-0000
    # timesteps
    2
    # Variable name:
    faot550
    # Variable units:
    -
    # stereographic proj: lat_0 lon_0 lat_sec major_semiaxis minor_semiaxis
    90 -98 45 6370000.000000 6370000.000000
    # Grid: ncols nrows xorig yorig xcell ycell vgtyp vgtop vglvls[2]:
    137 137 -7398000.0 -7398000.0 108000.0 108000.0 2 10000.0 1 0.995
    # MSB 64-bit integers points[timesteps] and
    # IEEE-754 64-bit reals longitudes[timesteps][points] and
    # IEEE-754 64-bit reals latitudes[timesteps][points] and
    # MSB 64-bit integers columns[timesteps][points] and
    # MSB 64-bit integers rows[timesteps][points] and
    # IEEE-754 64-bit reals data[timesteps][points]:
    154 719
    -1.2970142966950573e+02 -1.2908750532347997e+02 -1.2783553910026154e+02
    ...
    """
    import numpy as np
    import pandas as pd
    # from .utils import get_proj4

    headerlines = []
    for i in range(19):
        _l = bf.readline().decode().strip()
        headerlines.append(_l)
        if i == 0:
            assert (_l.lower() == 'regridded-swath 2.0')
        elif i == 1:
            rsig_program = _l
        elif i == 2:
            stime = pd.to_datetime(_l)
        elif i == 4:
            ntime, = np.array(_l.split(), dtype='i')
        elif i == 6:
            varkeys = _l.split()
        elif i == 8:
            units = _l.split()
        elif i == 9:
            crshdr = _l.lower()
        elif i == 10:
            crsparts = np.array(_l.split(), dtype='d')
        elif i == 12:
            projparts = np.array(_l.split(), dtype='d')
            nlay = 1
            vgparts = np.array(projparts[-nlay - 3:], dtype='f')
            gridparts = projparts[:-len(vgparts)]

    dt = pd.to_timedelta('1h')
    points = np.frombuffer(bf.read(ntime * 8), dtype='>i8')
    outfmt = '%Y-%m-%dT%H:%M:%S%z'
    dts = pd.Series(np.concatenate([
        np.ones(points[ti]) * ti * dt
        for ti in range(ntime)
    ]))
    times = (stime + dts).dt.strftime(outfmt).values
    npoints = points.sum()
    lons = np.frombuffer(bf.read(npoints * 8), dtype='>d')
    lats = np.frombuffer(bf.read(npoints * 8), dtype='>d')
    cols = np.frombuffer(bf.read(npoints * 8), dtype='>i8')
    rows = np.frombuffer(bf.read(npoints * 8), dtype='>i8')
    vals = np.frombuffer(bf.read(npoints * 8), dtype='>d')
    valkey = f'{varkeys[0]}({units[0]})'
    gdn = 'unknown'  # gdnam
    gattrs = getgridprops(gdn, crshdr, crsparts, projparts, gridparts, vgparts)
    df = pd.DataFrame({
        'Timestep(UTC)': times.astype(f'={times.dtype.char}'),
        'LONGITUDE(deg)': lons.astype(f'={lons.dtype.char}'),
        'LATITUDE(deg)': lats.astype(f'={lats.dtype.char}'),
        'COLUMN(-)': cols.astype(f'={cols.dtype.char}'),
        'ROW(-)': rows.astype(f'={rows.dtype.char}'),
        valkey: vals.astype(f'={vals.dtype.char}'),
    })
    df.attrs.update(gattrs)
    df.attrs['rsig_program'] = rsig_program
    return df
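

# Usage sketch (hypothetical path): from_regriddedswath returns a DataFrame
# whose attrs carry the grid metadata assembled by getgridprops, keyed as
# 'vertical', 'projection', and 'crs', plus the originating rsig_program.
def _example_regriddedswath_attrs():
    with open('regridded_viirs.xdr', 'rb') as bf:
        df = from_regriddedswath(bf)
    return df.attrs['projection'], df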
""" import numpy as np import pandas as pd # from .utils import get_proj4 headerlines = [] for i in range(19): _l = bf.readline().decode().strip() headerlines.append(_l) if i == 0: assert (_l.lower() == 'regridded-swath 2.0') elif i == 1: rsig_program = _l elif i == 2: stime = pd.to_datetime(_l) elif i == 4: ntime, = np.array(_l.split(), dtype='i') elif i == 6: varkeys = _l.split() elif i == 8: units = _l.split() elif i == 9: crshdr = _l.lower() elif i == 10: crsparts = np.array(_l.split(), dtype='d') elif i == 12: projparts = np.array(_l.split(), dtype='d') nlay = 1 vgparts = np.array(projparts[-nlay - 3:], dtype='f') gridparts = projparts[:-len(vgparts)] dt = pd.to_timedelta('1h') times = [] points = np.frombuffer(bf.read(ntime * 8), dtype='>i8') outfmt = '%Y-%m-%dT%H:%M:%S%z' dts = pd.Series(np.concatenate([ np.ones(points[ti]) * ti * dt for ti in range(ntime) ])) times = (stime + dts).dt.strftime(outfmt).values npoints = points.sum() lons = np.frombuffer(bf.read(npoints * 8), dtype='>d') lats = np.frombuffer(bf.read(npoints * 8), dtype='>d') cols = np.frombuffer(bf.read(npoints * 8), dtype='>i8') rows = np.frombuffer(bf.read(npoints * 8), dtype='>i8') vals = np.frombuffer(bf.read(npoints * 8), dtype='>d') valkey = f'{varkeys[0]}({units[0]})' gdn = 'unknown' # gdnam gattrs = getgridprops(gdn, crshdr, crsparts, projparts, gridparts, vgparts) df = pd.DataFrame({ 'Timestep(UTC)': times.astype(f'={times.dtype.char}'), 'LONGITUDE(deg)': lons.astype(f'={lons.dtype.char}'), 'LATITUDE(deg)': lats.astype(f'={lats.dtype.char}'), 'COLUMN(-)': cols.astype(f'={cols.dtype.char}'), 'ROW(-)': rows.astype(f'={rows.dtype.char}'), valkey: vals.astype(f'={vals.dtype.char}'), }) df.attrs.update(gattrs) df.attrs['rsig_program'] = rsig_program return df if __name__ == '__main__': import pyrsig baseurl = ( 'https://ofmpub.epa.gov/rsig/rsigserver?SERVICE=wcs&VERSION=1.0.0' + '&REQUEST=GetCoverage&FORMAT=xdr' ) baseurl += '&BBOX=-130.0,20.,-65.,60&COMPRESS=1' ckey = 'pandora.L2_rnvs3p1_8.nitrogen_dioxide_vertical_column_amount' url = ( f'{baseurl}&TIME=2023-10-17T13:00:00Z/2023-10-17T13:59:59Z' + f'&COVERAGE={ckey}' ) r = pyrsig.legacy_get(url) print('Profile test') dfprof = from_xdr(r.content, decompress=True) url = ( f'{baseurl}&TIME=2023-10-17T13:00:00Z/2023-10-17T14:59:59Z' + '&COVERAGE=airnow.ozone' ) # Get it and decompress it r = pyrsig.legacy_get(url) print('Site test') dfsite = from_xdr(r.content, decompress=True) tempokey = open('/home/bhenders/.tempokey', 'r').read().strip() ckey = 'tempo.l2.no2.vertical_column_troposphere' url = ( f'{baseurl}&TIME=2023-11-17T13:00:00Z/2023-11-17T13:59:59Z' + '&COVERAGE={ckey}&KEY={tempokey}' ) # Get it and decompress it r = pyrsig.legacy_get(url) print('Swath test') dfswath = from_xdr(r.content, decompress=True)