Re: [netcdfgroup] storing sparse matrices data in NetCDF

  • To: netcdfgroup@xxxxxxxxxxxxxxxx
  • Subject: Re: [netcdfgroup] storing sparse matrices data in NetCDF
  • From: Sourish Basu <Sourish.Basu@xxxxxxxxxxxx>
  • Date: Mon, 18 Mar 2019 15:15:47 -0600
  • Autocrypt: addr=Sourish.Basu@xxxxxxxxxxxx; prefer-encrypt=mutual; keydata= mQINBFZ40gQBEADvGCqRa5XGgaROg6TYUieMAh5GDTy6lclqxdKqu4oYSROUFYkEuT4tOHpV 4k6Ruhg1EYXMl0siTQ4VsTcvaFBR3RLKiOdRxsh4jPVrZI1TWJPJRWlvNg5iPXczTkH3diyA 2Pp4CBNfpw/M5uUHMgfL/A+1AAT0ciMUq9eR5U8bcjvfemg6Js/+tsNOwyHGlZEEXOrod/eb NrqOB1FA0WFHvEkFgJ1Ed/g3ulu0ylel0HoC/rCv0pU/PX+wNqucQbk00xjOw8ts02keX2z+ LQumHPWfSNrQpPh5u4L6XLAcc0RjEig6WkwHJQtjdEIoI+TNXtrDdQ09yrOg1dgQwz/kgnp4 oLBLjs2K3XMymsSorPCcInAoG3kchRadsmv69WX+YxIWPJaKOVrNs5K7Jf0nX9cyGe5Q1XuW 8sbk7IGKux9sy0S9HaYqU/w6HUhGl85522ogCeyZ7xnZoKuPthHGOBULsS8YD7BqYrxgNwnA JsDgefkSqLUiKixie/Tb8V/dSDpFfXEm6/ixNPm6iXZAza8ZVreQqQ3gs44VIHZi1c0qHtC0 kP2dx3IujfkdeUL3g2m84GMHmeG5Q207P50rqWk0kPzHRu5xMDQYICQ4wJwaEo69oM776sQ7 YxRXmFqzo2UX45rTVoq5xaOS0NRFteN4lJDB35911h5El46l+QARAQABtChTb3VyaXNoIEJh c3UgPFNvdXJpc2guQmFzdUBjb2xvcmFkby5lZHU+iQI4BBMBAgAiBQJWeNrBAhsjBgsJCAcD AgYVCAIJCgsEFgIDAQIeAQIXgAAKCRDdna2p+Lv9IC5NEADjZ9A1SZWzlN/trUcRIL9Vt2xZ oesJDGbv24gXUTbe7O0aSB6EfQCCBS2wRCjtvHGBaTEMbL0oGYTIBS2VZg/xL4LFXtKqwkWe 27Z+6erRGpIVCvO2gj5uVLBvi6MGrxc+TNfKSsH+6sxnL0lHZe2H9ptpn4+RXlSchQyH9x9D qf0o9C3iUxVthdwzfS6lpJsXnTM7DfLZz/2vr7eSfTYh106fQU++WUE4KcWcH/p/DG9R0hRg e2WJQ5oVuFS7tPKJuRyEBfhDuk92HAviLg/FgisfTNNRsrVaQJfBI3sDweTV/ueP7D9TqByF l+6Xl1h3gflMhTX6llQmTHXYtU30fjk8V8yjEr90mfpBbdnWbqbI0kqCqa4f+X1L0F20vSrf slhq0JWsZR96yljonXorW93aYu/4LCvO5AGtSx4LUTX7/jVM2DWQfa59/Ioqygz0V2EYDzQF poST5TznXPlsz+0kIUzUoLjv9+ES93idZt8rRNmklHdOyA9eEAIqv/b2mXSpYMjk7HMbxPgR h+jX7WiyfbFi12z4ApQR8MbHe5iZwTsrMhwlzoJEDp4tL3NcWf5nQoRnjZ+6i0zNzMy4T1Jp LIcK82J/Vam93eLXm4E8wJbz70VEuSGO8Ei1nevG4BwSNw9vbjo0t5GvCLt93RKBXsNaJ5y4 C9W/ewSg+rkCDQRWeNIEARAAypsoemKwvg4pZv54DXN/bkmWTgSiYHWQqrUhMyP5UTi3hWw6 yuXtcDJ8QlHE9TBzO+JIKmf5q8ueANV7Rj1XXk2HiqDLggHgFy7lT/Vjv/cxp7l36kSn9iFM Y/pkg5C297g/dOmuxP/igInh5tpkIHU9qbbAGjLmplR95MEowivJKPbgs6QIFGcfuHCxNz+w 9vgqG+oZmtG9yE34/vS651v/9qJc4WW2t/oywUCm5ti/FwLV0MJ7hXmK48DpTzAVo5bAwkWB ALFvIbgGShncg5Ubn2xxe2dkgUAdxhX6bWPA3P7mC+3xrHtV0uRCBbYDCDH8LOGPWKK0poRn 
iUcWlKY6PAGSiAXzBmgex3lv/EymYUHH4D1QJTxaoLpO/8O17AharvkuAD2Wi5s3j/9PiE5d ilxww5Df+43memityqJzFoFLgvlftXYsnQ5dsJGXOVhnf5IE+xzWnP/W5qTuDswlV8ZJ4OjT +KiZkePhirXiKLObcwpODZ97VCE0O1DHOWNfuvg6aQd97FHo51wRs2CI5SBa2xbpEhbwKu3p Py11Lkn0NQ/3qPrnKOs4bxb5nn+mUGkLMeQLantWmnWF8r7WxELkf+06jYliG6LTdCmVld6r PqW8E6/KQaZJXjRcbJ01b80IilyFCE9l2uA+ZgVieCHWHFuQ28+yNscvJVEAEQEAAYkCHwQY AQIACQUCVnjSBAIbDAAKCRDdna2p+Lv9IPQUEACPYzMYudTbWC9w615+fpW6kZdWXRByGCqJ G8fM2zkADi521ZH3nzWzdOjAxXZ94ujEUuNMeEBDlk4lmmb1i4jstyRWf5FJBqbGM52PiPMn 5mcI8GzIayvYMugDCoyMH1WGEI3lmQbIAr7kkyjLDbhTa74YmMvzgtmHRgDSHqHqAKCrBKde HqkvxEFu8clL50KRsUm45RU4BOupbzHnw2zxzhEmK1PJaJ5WqCMSX4icftGlkWNEJq3KmmSf JPmIO48ACXneslTzF3hRTslEreAHYvQJprYZDj3Cr1ttftCcrhs7L4Fz64BhYBoye+j78z4A 8EFbaWzGuZt1SRBvd8a2qiq9kj8g2FzzxqXsCluILwN9GNPZbYY2aXPemZcbbUYtqEA4yTRs vVvd68NXxIINTfXDlAYHr1DcCHvwqr+oZuG+J70zl1vVxjCl6BvdsG6VdMM0ag6RGhuRms81 EQj1oyFg3EBFJoJ/6vXV0rTTs+Yw6DsaFlNrM/xnGUf1hD5uh7utRyYfGpLssdJQzn/dVsRU iUPL5w35WZ/za2VJIs7Mv6f9DmxaRd6FrtCmc1GoXYspcj95ytrcFHKi+MviOUkhuEnOz/tB odfNf7h8Mkb5mVHONfWrFdIyF2ZqngD+Lx2YgITXdakyBq9WOFwGoyHblqQbO4PKxD38b7av /A==
Ken,

Here's a sample Python program that should make 'foo.nc' from your
'foo.csv'. Just call the function Write_netcdf with whatever input and
output filenames you want. There's some basic error checking, but more
might be needed depending on your data. Also attached is the resulting
netCDF file.

-Sourish

On 3/18/19 2:57 PM, Ken Mankoff wrote:
> On 2019-03-18 at 13:12 -0700, Sourish Basu <Sourish.Basu@xxxxxxxxxxxx> 
> wrote...
>> In your example dataset, there are five values for the time
>> coordinate. However, the values of x, y, lat, lon, and elev do not
>> seem to depend on the values of time. Is this true in general for your
>> data? If that's true (while still allowing x, y etc. to vary from year
>> to year, or file to file), that makes packaging even simpler.
> Correct. There are 6 header rows that *never* change: ID, lon,lat, x,y, elev. 
> There is 1 index column that is date. Then the data that is a function of 
> (ID,date) (or ((lon,lat),date), or ((x,y),date)) does change.
>
>   -k.

Attachment: foo.nc
Description: Cdf file

from netCDF4 import Dataset
import numpy as np
from datetime import datetime

def Read_CSV(file_name):
    """Parse a CSV file made of per-record header rows followed by dated rows.

    The first rows of the file (one per entry of the ``data_types`` table
    below: ID, x, y, lat, lon, elev) each carry a variable name in the first
    column followed by one value per record. Every remaining row carries a
    date in the first column (assumed to be ``YYYY-MM-DD``) followed by one
    value per record. Blank lines are ignored.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read.

    Returns
    -------
    dict
        ``'var_names'``   : list of header variable names, in file order.
        ``'var_values'``  : dict mapping each name to a 1D numpy array, with
                            the dtype given in ``data_types``.
        ``'time_values'`` : list of ``datetime`` objects, one per data row.
        ``'ret_array'``   : float32 numpy array of shape
                            (number of data rows, number of records).

    Raises
    ------
    RuntimeError
        If the file has fewer non-blank lines than header variables, or if
        any row does not have the same number of values as the first header
        row.
    KeyError
        If a header row starts with a name that is not in ``data_types``.
    ValueError
        If a value cannot be converted to float, or a date does not match
        ``YYYY-MM-DD``.
    """
    # empty dictionary in which to return everything
    ret_dict = {
        'var_names': [],
        'var_values': {},
        'time_values': [],
        'ret_array': None,
    }
    # data types for the different 1D arrays (before the time variation starts)
    data_types = {
        'ID': np.int32,
        'x': np.float32,
        'y': np.float32,
        'lat': np.float32,
        'lon': np.float32,
        'elev': np.float32,
    }
    num_vars = len(data_types)

    # read all the lines, dropping blank ones (e.g. a trailing empty line)
    # and surrounding whitespace/newlines so they cannot break the parsing
    with open(file_name, 'r') as fid:
        all_lines = [line.strip() for line in fid if line.strip()]

    if len(all_lines) < num_vars:
        raise RuntimeError(
            '%s has %i non-blank lines, expected at least %i header lines'
            % (file_name, len(all_lines), num_vars))

    # the first num_vars lines contain the 1D arrays, whose names have to be
    # in the first column
    num_vals = None
    for relevant_line in all_lines[:num_vars]:
        fields = relevant_line.split(',')
        key = fields[0].strip()
        if key not in data_types:
            raise KeyError(
                'unknown header variable %r, expected one of %s'
                % (key, sorted(data_types)))
        values = np.array([float(x) for x in fields[1:]],
                          dtype=data_types[key])
        ret_dict['var_names'].append(key)
        ret_dict['var_values'][key] = values
        # basic check to ensure that all lines have the same number of values
        if num_vals is None:
            # this is the first line, so get the record length
            num_vals = len(values)
        elif len(values) != num_vals:
            raise RuntimeError(
                '%s has %i records, expected %i'
                % (key, len(values), num_vals))

    # all lines henceforth have YYYY-MM-DD (or is it YYYY-DD-MM? can't tell
    # from the provided file) as the first column
    # coding now assuming YYYY-MM-DD
    data_lines = all_lines[num_vars:]
    ret_dict['ret_array'] = np.zeros((len(data_lines), num_vals), np.float32)
    for i, line in enumerate(data_lines):
        fields = line.split(',')
        time_val = datetime.strptime(fields[0].strip(), '%Y-%m-%d')
        var_val = np.array([float(x) for x in fields[1:]], dtype=np.float32)
        # check if the length of var_val matches the expected record length
        if len(var_val) != num_vals:
            raise RuntimeError(
                'Time %s has %i records, expected %i'
                % (time_val.strftime('%Y-%m-%d'), len(var_val), num_vals))
        ret_dict['time_values'].append(time_val)
        ret_dict['ret_array'][i] = var_val

    return ret_dict

def Write_netcdf(netcdf_file='foo.nc', csv_file='foo.csv'):
    """Read ``csv_file`` with Read_CSV and write its contents to a netCDF file.

    The output file gets:
      * one 1D variable per header variable, along the 'record' dimension;
      * 'date_components', an int16 (times, time_tuple) variable holding the
        first three timetuple fields (year, month, day) of every time value;
      * 'data_values', the 2D (times, record) array of values.

    Parameters
    ----------
    netcdf_file : str
        Path of the netCDF file to create (opened in 'w' mode).
    csv_file : str
        Path of the CSV file to read.
    """
    parsed = Read_CSV(csv_file)

    # compression (optional), passed to every createVariable call
    compression = dict(zlib=True, shuffle=True, complevel=6)

    with Dataset(netcdf_file, 'w') as nc:
        # create the dimensions; a size of None makes a dimension unlimited
        dimensions = (('times', None), ('record', None), ('time_tuple', 3))
        for dim_name, dim_size in dimensions:
            nc.createDimension(dim_name, dim_size)

        # write the auxiliary (per-record) variables
        for name in parsed['var_names']:
            values = parsed['var_values'][name]
            aux_var = nc.createVariable(
                name, values.dtype, ('record',), **compression)
            aux_var[:] = values

        # write the time values as (year, month, day) triplets
        date_var = nc.createVariable(
            'date_components', np.int16, ('times', 'time_tuple'),
            **compression)
        date_rows = [stamp.timetuple()[:3] for stamp in parsed['time_values']]
        date_var[:] = np.array(date_rows, dtype=np.int16)

        # now write the 2D array of values
        grid = parsed['ret_array']
        data_var = nc.createVariable(
            'data_values', grid.dtype, ('times', 'record'), **compression)
        data_var[:] = grid

Attachment: signature.asc
Description: OpenPGP digital signature

  • 2019 messages navigation, sorted by:
    1. Thread
    2. Subject
    3. Author
    4. Date
    5. ↑ Table Of Contents
  • Search the netcdfgroup archives: