# -*- coding: utf-8 -*-
"""
Base class for any `cerbere` dataset. Built as a wrapper around a
:class:`~xarray.Dataset` object.
"""
from abc import ABC, abstractmethod
import copy
import datetime
import glob
import logging
import os
from typing import (Any, Dict, Hashable, Iterable, Iterator, List,
                    Mapping, Optional, Sequence, Set, Tuple, Union, cast,
                    OrderedDict)
from urllib.parse import urlparse

from dateutil import parser
import numpy as np
import shapely.geometry
import xarray as xr

from ..cfconvention import default_global_attrs
from .field import Field

# mapper access modes
READ_ONLY = 'r'
WRITE_NEW = 'w'
READ_WRITE = 'r+'

# standard geolocation coordinates
GEOCOORDINATES = [u'time', u'lat', u'lon', u'z']
REQUIRED_GEOCOORDINATES = [u'time', u'lat', u'lon']

# CDM feature types
CDM_TYPES = {
    'swath': 'Swath',
    'grid': 'Grid'
}

# common matching dimensions and fields in netCDF files
DIM_MATCHING = {
    'time': 'time',
    'lon': 'lon',
    'longitude': 'lon',
    'lat': 'lat',
    'latitude': 'lat',
    'x': 'x',
    'y': 'y',
    'mes': 'station',
    'station': 'station',
    'ni': 'cell',
    'cell': 'cell',
    'ra_size': 'cell',
    'col': 'cell',
    'nj': 'row',
    'row': 'row',
    'az_size': 'row',
    'depth': 'z',
    'height': 'z',
    'rows': 'row',
    'columns': 'cell',
    'NUMROWS': 'row',
    'NUMCELLS': 'cell',
    'across_track': 'cell',
    'along_track': 'row'
}

FIELD_MATCHING = {
    'time': 'time',
    'longitude': 'lon',
    'latitude': 'lat',
    'lon': 'lon',
    'lat': 'lat',
    'depth': 'z',
    'height': 'z'
}

TIME_COVERAGE_ATTRS = {
    'date': {
        'start': 'start_date',
        'end': 'stop_date'
    },
    'time': {
        'start': 'start_time',
        'end': 'stop_time'
    },
    'time_coverage': {
        'start': 'time_coverage_start',
        'end': 'time_coverage_end'
    },
    'time_coverage2': {
        'start': 'time_coverage_start',
        'end': 'time_coverage_stop'
    },
    'meas_time': {
        'start': 'first_meas_time',
        'end': 'last_meas_time'
    },
}

BBOX_CORNERS = {
    'latmin': [
        'southernmost_latitude', 'geospatial_lat_min', 'south_latitude'
    ],
    'latmax': [
        'northernmost_latitude', 'geospatial_lat_max', 'north_latitude'
    ],
    'lonmin': [
        'westernmost_longitude', 'geospatial_lon_min', 'west_longitude'
    ],
    'lonmax': [
        'easternmost_longitude', 'geospatial_lon_max', 'east_longitude'
    ]
}

class Dataset(ABC):
    """
    The `cerbere` dataset base class.

    A :class:`Dataset` object is internally built by composition over
    :class:`xarray.Dataset` objects.

    A :class:`Dataset` object can be created in different ways:
      * from a file (or list of files), giving its full path or URL
      * from a xarray :class:`xarray.Dataset` object
      * from a dict, using xarray syntax
      * from another :class:`Dataset` object

    Creating a Dataset from a file:

    >>> from cerbere.dataset.dataset import Dataset
    >>> dst = Dataset('./mydatafile.nc')

    Creating a Dataset from an xarray :class:`xarray.Dataset` object:
    The :mod:`xarray` object must have latitude, longitude and time coordinates
    with valid `cerbere` names (``lat``, ``lon``, ``time``):

    >>> import xarray as xr
    >>> import numpy as np
    >>> xrobj = xr.Dataset(
        coords={
            'lat': np.arange(0,10, 0.1),
            'lon': np.arange(5,15, 0.1),
            'time': np.full((100,), np.datetime64('2010-02-03'))
            },
        data_vars={'myvar': (('time',), np.ones(100))}
        )
    >>> dst = Dataset(xrobj)

    Creating a dataset from a dictionary:
    Using the same syntax as xarray (see:
    http://xarray.pydata.org/en/stable/generated/xarray.Dataset.from_dict.html#xarray.Dataset.from_dict).
    The provided dict must have latitude, longitude and time coordinates
    with valid `cerbere` names (``lat``, ``lon``, ``time``):

    >>> dst = Dataset(
            {'time': {'dims': ('time'), 'data': [datetime(2018, 1, 1)]},
             'lat': {'dims': ('lat'), 'data': np.arange(-80, 80, 1)},
             'lon': {'dims': ('lon',), 'data': np.arange(-180, 180, 1)},
             'myvar': {'dims': ('lat', 'lon',),
                       'data': np.ones(shape=(160, 360))}
             }
        )

    >>> dst = Dataset(
            {'coords': {
                'time': {'dims': ('time'), 'data': [datetime(2018, 1, 1)],
                         'attrs': {'units': 'seconds since 2001-01-01 00:00:00'}},
                'lat': {'dims': ('lat'), 'data': np.arange(-80, 80, 1)},
                'lon': {'dims': ('lon',), 'data': np.arange(-180, 180, 1)},
            },
            'attrs': {'gattr1': 'gattr_val'},
            'dims': ('time', 'lon', 'lat'),
            'data_vars': {'myvar': {'dims': ('lat', 'lon',),
                                    'data': np.ones(shape=(160, 360))}}}
        )

    :class:`Field` objects can also be mixed in:

    >>> field = Field(
            np.ones(shape=(160, 360)),
            'myvar',
            dims=('lat', 'lon',),
            attrs={'myattr': 'attr_val'}
        )
    >>> dst = Dataset(
            {'time': {'dims': ('time'), 'data': [datetime(2018, 1, 1)]},
             'lat': {'dims': ('lat'), 'data': np.arange(-80, 80, 1)},
             'lon': {'dims': ('lon',), 'data': np.arange(-180, 180, 1)},
             'myvar': field
             }
        )

    Args:

        dataset (str, list[str], xarray.Dataset): full path to a file from
            which to read the dataset content. It can also be a list of paths
            or a glob pattern, if the mapper allows opening several files at
            once. Multiple files can only be opened in READ_ONLY mode.

        mode (enum, optional): access mode ('r', 'w', 'r+') when accessing a
            file.

        view (dict, optional): a dictionary where keys are dimension names
            and values are slices. A view can be set on a file, meaning
            that only the subset defined by this view will be accessible.
            This view is expressed as any subset (see :func:`get_values`).
            For example:

            >>> view = {'time':slice(0,0), 'lat':slice(200,300),
            >>>         'lon':slice(200,300)}

        dim_matching (dict): explicitly provides the matching between file
            native dimensions (keys) and their CF/cerbere standard name
            (values).

        field_matching (dict): explicitly provides the matching between file
            native fields (keys) and their CF/cerbere standard name (values).

        kwargs (dict): any argument to be passed on to the xarray
            open_dataset or open_mfdataset function.
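
    A usage sketch combining these arguments (the file name and the native
    dimension names below are purely illustrative):

    >>> # map native dims 'ni'/'nj' to cerbere standard 'cell'/'row'
    >>> dst = Dataset(
    >>>     './myswathfile.nc',
    >>>     dim_matching={'ni': 'cell', 'nj': 'row'}
    >>> )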

    """
    def __init__(self,
                 dataset: Union[str, xr.Dataset, 'Dataset', dict],
                 mode=READ_ONLY,
                 view: Optional[Dict[str, slice]] = None,
                 dim_matching: Optional[Dict[str, str]] = DIM_MATCHING,
                 field_matching: Optional[Dict[str, str]] = FIELD_MATCHING,
                 attr_matching: Optional[Dict[str, str]] = None,
                 format: str = None,
                 **kwargs):
        """
        """
        if dataset is None:
            raise ValueError('an url or dataset object must be provided')

        object.__init__(self)

        self.original = None
        self.dataset = None

        # memorize opening arguments
        self.view = view
        self._mode = mode
        self._format = format
        self.args = kwargs

        self._url = None
        if isinstance(dataset, (str, list)):
            if mode != WRITE_NEW:
                self._url = self._analyse_url(dataset)
            else:
                self._url = dataset

            # check permissions
            if isinstance(self._url, list) and mode in [WRITE_NEW, READ_WRITE]:
                raise ValueError("A list of files is read only")

            self._open(**kwargs)

        elif isinstance(dataset, xr.Dataset):
            self.dataset = dataset

        elif isinstance(dataset, dict):
            self._create_from_dict(dataset)
        elif isinstance(dataset, Dataset):
            self.dataset = dataset
        else:
            raise TypeError(
                "Incorrect type {} to create a cerbere dataset object".format(
                    type(dataset))
            )

        if not isinstance(dataset, Dataset):
            # translation tables
            self._std2native_field = field_matching
            self._std2native_dim = dim_matching
            self._std2native_attr = attr_matching

            if self.original is None:
                self.original = self.dataset.copy(deep=False)

            if self.is_empty():
                return

            if self.view is not None:
                self.original = self.original[self.view]
                self.dataset = self.dataset[self.view]

            # standardize dataset
            self._transform()

        # check geolocation coordinates are defined and valid
        if not self._check_geocoordinates():
            raise ValueError(
                "The dataset is not a valid observation set or is malformed."
            )

    def _create_from_dict(self, data):
        """
        Create the dataset from a dict of fields and attributes.
        """
        # addition to xarray : fields can be provided as Field object
        def to_dict(arr):
            return {'dims': arr.dims, 'data': arr.data, 'attrs': arr.attrs}

        for var in data.keys():
            if isinstance(data[var], Field):
                data[var] = to_dict(data[var].to_dataarray)

        if 'coords' in data.keys():
            for var, value in data['coords'].items():
                if isinstance(value, Field):
                    data['coords'][var] = to_dict(value.to_dataarray)
        if 'data_vars' in data.keys():
            for var, value in data['data_vars'].items():
                if isinstance(value, Field):
                    data['data_vars'][var] = to_dict(value.to_dataarray)

        # create a dataset
        self.dataset = xr.Dataset.from_dict(data)

    def _check_geocoordinates(self):
        """check the required geolocation coordinates are in the dataset"""
        coord_validity = all([
            coord in self.geocoords
            for coord in REQUIRED_GEOCOORDINATES
        ])

        if not coord_validity:
            for coord in REQUIRED_GEOCOORDINATES:
                if coord not in self.geocoords:
                    logging.warning(
                        "  => Missing coordinate var: {}".format(coord)
                    )
        return coord_validity

    def is_empty(self) -> bool:
        """Returns True if the dataset object contains no data yet"""
        return self.original.equals(xr.Dataset())

    def __is_remote(self):
        """check is the url corresponds to a remote file (http, ftp,...)"""
        try:
            result = urlparse(self._url)
            return all([result.scheme, result.netloc])
        except ValueError:
            return False

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def sync(self):
        """force physical writing of the content on disk."""
        raise NotImplementedError

    @classmethod
    def exists(cls, url: str) -> bool:
        """tests if `url` is an existing resource"""
        try:
            result = urlparse(url)
            if len(result.scheme) == 0:
                return os.path.exists(url)
            return all([result.scheme, result.netloc])
        except ValueError:
            return False

    @classmethod
    def _analyse_url(cls, url):
        """check and expand url if necessary"""
        if url is None:
            return

        if isinstance(url, list):
            for u in url:
                if not cls.exists(u):
                    raise IOError("{} does not exist".format(u))
            return url

        elif isinstance(url, str):
            if cls.exists(url):
                return url

            # not a plain path: is it a file pattern?
            files = glob.glob(url)
            if len(files) > 0:
                return files
            raise IOError("no existing files found for {}".format(url))

    @classmethod
    def _get_time_format(cls):
        return '%Y-%m-%dT%H:%M:%SZ'

    def has_field(self, fieldname):
        """Return True if the field ``fieldname`` exists."""
        return fieldname in self.fieldnames

    def rename_field(self, field, newname):
        """Rename a field.

        Args:
            field (str): name of the field to be renamed
            newname (str): new name of the field
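
        A usage sketch (the field names are illustrative):

        >>> dst.rename_field('sea_surface_temperature', 'sst')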
        """
        if isinstance(self.dataset, Dataset):
            return self.dataset.rename_field(field, newname)
        self.get_field(field).rename(newname)

    def _get_native_fieldname(self, fieldname):
        """Return the original name of a field."""
        try:
            return self._std2native_field[fieldname]
        except (TypeError, KeyError):
            return fieldname

    def _get_matching_dimname(self, dimname):
        """Return the equivalent name in the native format for a standard
        dimension.
        """
        try:
            return self._std2native_dim[dimname]
        except (TypeError, KeyError):
            return dimname

    def _get_standard_dimname(self, dimname):
        """Returns the equivalent standard dimension name for a dimension in the
        native format.
        """
        try:
            return {v: k for k, v in self._std2native_dim.items()}[dimname]
        except (TypeError, KeyError):
            return dimname

    @property
    def url(self) -> str:
        if isinstance(self.dataset, Dataset):
            return self.dataset._url
        return self._url

    @property
    def basename(self) -> str:
        return os.path.basename(self.url)

    @property
    def source(self) -> xr.Dataset:
        """Return the original dataset
        """
        return self.original

    def is_opened(self) -> bool:
        """Return True if a file is opened"""
        return self.original is not None

    def new_file(self):
        """Return True if the storage is opened in write mode"""
        return self._mode == WRITE_NEW

    def is_readonly(self):
        """Return True if the storage is opened in read only mode"""
        return self._mode == READ_ONLY

    @property
    def filesize(self):
        """return the dataset file size, in octets"""
        return os.path.getsize(self.url)

    @property
    def creation_date(self):
        """return the date the product was generated (NOT the file date!)"""
        return datetime.datetime.fromtimestamp(os.path.getctime(self.url))

    def _open_dataset(self, **kwargs):
        """
        Open an xarray-compatible file (netCDF, Zarr, ...)
        """
        logging.debug("opening {} in mode : {}".format(self._url, self._mode))
        if not self.new_file():
            try:
                options = kwargs
                for opt in ['mask_and_scale',
                            'decode_cf',
                            'decode_coords',
                            'decode_times',
                            'cache'
                            ]:
                    if opt not in options:
                        options[opt] = True
                if isinstance(self._url, list):
                    return xr.open_mfdataset(
                        self._url,
                        **options
                    )
                else:
                    return xr.open_dataset(
                        self._url,
                        **options
                    )
            except Exception:
                logging.error("Could not read file: {}".format(self._url))
                raise
        else:
            return xr.Dataset()

    def _open(self, **kwargs):
        if self.is_opened():
            logging.warning("A file is already opened : {}".format(
                self._url)
            )

        if self._url is None:
            raise IOError('URL of the dataset to open was not provided')

        self.original = self._open_dataset(**kwargs)
        if self.original is not None:
            self.dataset = self.original.copy(deep=False)


        # @TODO
        # All this should be in __init__, not called directly (a la netCDF)
        # field and dim matching should not be attributes but only used in
        # __init__

        # case of a grid feature stored in the netCDF file with the time field
        # split into two variables 'time' and 'dtime'. This is hidden from the user
        # and only a single time field is returned.
        #         if self.__is_2d_time_splitted_in_grid():
        #             fields.remove('dtime')

        #         if fieldname == 'time' and self.__is_2d_time_splitted_in_grid():
        #             native_fieldname = 'dtime'

        #         if self.center_on_greenwhich:
        #             # check if file content is indeed shifted
        #             lonvar = self.get_native_fieldname('lon')
        #             if len(self._handler[lonvar].dims) > 1:
        #                 raise Exception(
        #                     "This is not a grid or it is not in cylindrical projection"
        #                     ". Grid centering can not be applied here"
        #                 )
        #             firstlat = self._handler[lonvar][0]
        #             lastlat = self._handler[lonvar][-1]
        #             lastlat = lastlat if lastlat <= 180. else lastlat - 360.
        #             if firstlat < lastlat:
        #                 raise Exception("Grid does not seem to be de-centered")

        return self.original

    def _transform(self):
        """apply some transformation to original dataset to make it more
        standardized.
        """
        # rename coordinates and dimensions
        self._decode_dim_matching()
        self._decode_field_matching()

        self.dataset = self.dataset.rename(
            {**self._std2native_field,
             **self._std2native_dim}
        )

        # rename attributes
        self._decode_attributes()

    def _decode_dim_matching(self):
        """Build the correspondance table between standard and native dimension
        name.
        """
        if self._std2native_dim is None:
            return

        reduced = {}
        for dim in self.original.dims:
            dimdict = self._std2native_dim
            if dim in dimdict and dimdict[dim] not in self.dataset.dims:
                reduced[dim] = dimdict[dim]

        self._std2native_dim = reduced

    def _decode_field_matching(self):
        """Build the correspondance table between standard and native field
        name.
        """
        if self._std2native_field is None:
            return

        reduced = {}
        for field in self.original._variables:
            fielddict = self._std2native_field
            if (field in fielddict
                    and fielddict[field] not in self.dataset._variables):
                reduced[field] = fielddict[field]

        self._std2native_field = reduced

    @property
    def sizes(self):
        return OrderedDict(self.dataset.sizes)

    @property
    def dims(self):
        """
        Return the dimensions of the dataset.

        CF convention names are used whenever possible.

        Returns:
            OrderedDict<str, int>: a dictionary of the names and sizes of the
                dataset dimensions.
        """
        return OrderedDict(self.dataset.dims)

    @property
    def dimnames(self):
        return tuple(self.dims.keys())

    @property
    def dimsizes(self):
        return tuple(self.dims.values())

    def get_dimsize(self, dimname):
        """Return the size of a dimension.

        Args:
            dimname (str): name of the dimension.

        Returns:
            int: size of the dimension.
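
        For instance, assuming the dataset has a ``time`` dimension:

        >>> dst.get_dimsize('time')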
        """
        return self.dims[dimname]

    def get_field_dims(self, fieldname: str) -> OrderedDict[str, int]:
        """Mapping from dimension names to lengths."""
        if isinstance(self.dataset, Dataset):
            return self.dataset.get_field_dims(fieldname)
        return OrderedDict(self.dataset[fieldname].sizes)

    def get_field_dimnames(self, fieldname: str) -> Tuple[str]:
        if isinstance(self.dataset, Dataset):
            return self.dataset.get_field_dimnames(fieldname)
        return tuple(self.get_field_dims(fieldname).keys())

    def get_field_dimsizes(self, fieldname: str) -> Tuple[int]:
        if isinstance(self.dataset, Dataset):
            return self.dataset.get_field_dimsizes(fieldname)
        return tuple(self.get_field_dims(fieldname).values())

    @property
    def _varnames(self) -> List[str]:
        """List names of all the fields (including coordinates) of the dataset.
        """
        if isinstance(self.dataset, Dataset):
            return self.dataset._varnames
        return list(self.dataset.variables.keys())

    @property
    def coordnames(self) -> List[str]:
        """List of names of the coordinate fields of the dataset."""
        if isinstance(self.dataset, Dataset):
            return self.dataset.coordnames
        return list(self.dataset.coords.keys())


    @property
    def geocoords(self) -> List[str]:
        """List of the names of the geolocation coordinates."""
        if isinstance(self.dataset, Dataset):
            return self.dataset.geocoords
        return [
            _ for _ in self.dataset.coords if _ in GEOCOORDINATES
        ]

    @property
    def fieldnames(self):
        """Returns the names of the geophysical fields of the mapper.

        The coordinate field names are excluded from this list.

        Returns:
            list<string>: list of field names
        """
        if isinstance(self.dataset, Dataset):
            return self.dataset.fieldnames
        return list(self.dataset.data_vars.keys())

    def get_field(self, fieldname: str) -> 'Field':
        """
        Return the :class:`cerbere.dataset.field.Field` object corresponding to
        the requested field name.

        The :class:`cerbere.dataset.field.Field` class contains all the metadata
        describing a field (equivalent to a variable in netCDF).

        Args:
            fieldname: name of the field

        Returns:
            the corresponding Field object
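
        A usage sketch (the field name is illustrative):

        >>> field = dst.get_field('myvar')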
        """
        if isinstance(self.dataset, Dataset):
            return self.dataset.get_field(fieldname)

        if fieldname not in self.fieldnames:
            raise ValueError("Unknown field {}".format(fieldname))

        return Field(self.dataset[fieldname], fieldname, dataset=self)

    def add_field(self, field: 'Field') -> None:
        """Add a field to the feature.

        Args:
            field: the field is provided as a :class:`Field` object
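
        A usage sketch, reusing the :class:`Field` construction shown in the
        class documentation (names and shapes are illustrative):

        >>> field = Field(
        >>>     np.ones(shape=(160, 360)),
        >>>     'myvar',
        >>>     dims=('lat', 'lon',)
        >>> )
        >>> dst.add_field(field)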
        """
        if isinstance(self.dataset, Dataset):
            return self.dataset.add_field(field)

        if field.name in self.fieldnames:
            raise Exception(
                "Field already existing in feature. Can not add {}"
                .format(field.name)
            )
        self.dataset = self.dataset.assign(
            {field.name: field.to_dataarray}
        )
        field._attach_dataset(self)

    def get_geocoord(self, coordname: str) -> Field:
        """
        Return the geolocation coordinate field with the requested name.

        Possible coordinate field names are 'lat', 'lon', 'time'.
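
        For instance:

        >>> lat = dst.get_geocoord('lat')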
        """
        if isinstance(self.dataset, Dataset):
            return self.dataset.get_geocoord(coordname)

        if coordname not in self.geocoords:
            raise ValueError(
                "Coordinate field {} is not exising: ".format(coordname)
            )

        return Field(self.dataset.coords[coordname], coordname, dataset=self)

    def get_fillvalue(self, fieldname):
        return self.get_field(fieldname).fill_value

    def get_values(
            self, fieldname: str, index=None, as_masked_array: bool=True,
            expand: bool=False, expand_dims=None, **kwargs):
        """Read the data of a field.

        Args:
            fieldname (str): name of the field which to read the data from

            index: any kind of xarray indexing compatible with
                :func:`xarray.DataArray.isel` selection method.

            padding: pad the result with fill values where slices are out of
                the field dimension limits. Default is False.

            as_masked_array (bool, optional): return the result as a masked
                array instead of a xarray DataArray. Default is True (may add
                some overhead).

            expand (bool, optional): expand the result to the dimensions
                given in ``expand_dims``, broadcasting the values along the
                matching coordinate fields. Default is False.

            expand_dims (list[str], optional): names of the dimensions over
                which to expand the result, in the expected order.

        Return:
            MaskedArray: array of data read. Array type is the same as the
                storage type.
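
        A usage sketch (the field and dimension names are illustrative):

        >>> data = dst.get_values('myvar', index={'row': slice(0, 10)})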
        """
        if isinstance(self.dataset, Dataset):
            kwargs['expand'] = expand
            kwargs['expand_dims'] = expand_dims
            return self.dataset.get_values(
               fieldname, index=index, as_masked_array=as_masked_array, **kwargs
            )

        # ignore indices not applicable to the field
        reduced_index = None
        if index is not None:
            reduced_index = copy.copy(index)
            for dim in index:
                if dim not in self.get_field_dimnames(fieldname):
                    reduced_index.pop(dim)
                    if not expand:
                        logging.warning(
                            'Indexing on {} was attempted although it is not '
                            'a dimension of field {}. This indexing was '
                            'ignored; check your code'.format(dim, fieldname)
                        )

        values = self.dataset[fieldname].isel(reduced_index)

        # expand over requested dimensions
        if expand and expand_dims is not None:

            fdims = self.get_field_dimnames(fieldname)
            for dim in expand_dims[::-1]:
                if dim in fdims:
                    continue
                # get coordinate variable for this dimension
                for coord in self.geocoords:
                    if tuple(self.get_geocoord(coord).dimnames) == (dim,):
                        values = values.broadcast_like(
                            self.get_values(
                                coord, index=index,
                                as_masked_array=False, **kwargs
                            )
                        )
            if (len(values.dims) > 0
                    and tuple(values.dims) != tuple(expand_dims)):
                values = values.transpose(
                    *(list(expand_dims)), transpose_coords=True)

        if not as_masked_array:
            return values
        else:
            return values.to_masked_array(copy=False)

    def set_values(
            self,
            fieldname: str,
            values: Optional[np.ndarray],
            index: Optional[Mapping[str, slice]] = None,
            **kwargs):
        """set the values of a field.

        It is  possible to set only a subset of the field data array, using
        ``index``:

        >>> import numpy as np
        >>> ...
        >>> dst.set_values(
        >>>        'test',
        >>>        np.full((10, 5,), 2),
        >>>        {'x': slice(10, 20), 'y': slice(0, 5)}
        >>>        )

        See:
        Field.set_values

        """
        if isinstance(self.dataset, Dataset):
            return self.dataset.set_values(
                fieldname, values, index=index, **kwargs
            )

        if index is None:
            self.dataset[fieldname][:] = values
        else:
            self.dataset[fieldname][index] = values

    def extract(
            self,
            index: Optional[Mapping[str, slice]] = None,
            fields: Optional[List[str]] = None,
            padding: Optional[bool] = False,
            prefix: Optional[str] = None,
            deep: Optional[bool] = True,
            **kwargs
    ) -> 'Dataset':
        """
        Extract a subset of the dataset.

        If ``deep`` is False, returns only a view on the dataset, sharing the
        same data arrays in memory as the subsetted dataset.

        Args:
            index: any kind of xarray indexing compatible with
                :func:`xarray.DataArray.isel` selection method.

            padding: pad the result with fill values where slices are out of
                the field dimension limits. Default is False. ``deep`` must
                be set to True.

            fields: list of field names to extract. If None, all fields
                are extracted.

            prefix: add a prefix string to the field names of the returned
                subset.
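
        A usage sketch (the dimension and field names are illustrative):

        >>> subset = dst.extract(
        >>>     index={'row': slice(0, 100)},
        >>>     fields=['myvar'],
        >>>     prefix='sub'
        >>> )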
        """
        if isinstance(self.dataset, Dataset):
            return self.dataset.extract(
                index=index, fields=fields, padding=padding,
                prefix=prefix, deep=deep, **kwargs
            )

        if fields is None:
            fields = self._varnames

        if index is None:
            subset = self.dataset[fields]
        else:
            subset = self.dataset[fields][index]

        if not deep and padding:
            raise ValueError(
                "padding is only possible with deep extraction"
            )
        if deep:
            subset = subset.copy(deep=True)
            if padding:
                # pad xarray coordinates and data vars
                coords = {
                    coord: xr.DataArray(
                        Field._pad_data(
                            self.dataset[coord],
                            subset[coord],
                            index
                        ),
                        dims=subset[coord].dims,
                        attrs=subset[coord].attrs
                    )
                    for coord in subset.coords
                }
                data_vars = {
                    dvar: xr.DataArray(
                        Field._pad_data(
                            self.dataset[dvar],
                            subset[dvar],
                            index
                        ),
                        dims=subset[dvar].dims,
                        attrs=subset[dvar].attrs
                    )
                    for dvar in subset.data_vars
                }
                subset = xr.Dataset(
                    coords=coords,
                    data_vars=data_vars,
                    attrs=subset.attrs
                )

        # rename with prefix extracted variables
        if prefix is not None:
            newfields = {
                _: '{}_{}'.format(prefix, _)
                for _ in subset.data_vars
            }
            subset = subset.rename(newfields)
        return self.__class__(dataset=subset)

    def clone(
            self,
            fieldname: str,
            **kwargs
    ):
        """
        Create a copy of a field, limiting to a set of slices or indices, and
        padding out as required.

        See:
            Field.clone
        """
        return self.get_field(fieldname).clone(**kwargs)

    def get_field_attrs(self, fieldname):
        """Return the specific attributes of a field.

        Args:
            fieldname (str): name of the field.

        Returns:
            OrderedDict<str, object>: a dictionary where keys are the attribute
                names.
        """
        if isinstance(self.dataset, Dataset):
            return self.dataset.get_field_attrs(fieldname)

        return OrderedDict(self.dataset[fieldname].attrs)

    @property
    def attrs(self):
        """Returns the global attributes.

        Returns:
            OrderedDict<str, object>: Dictionary of global attributes on this
            dataset.
        """
        return self.dataset.attrs

    def get_attr(self, attr):
        """Returns the value of a global attribute.

        Args:
            attr(str): name of the global attribute.

        Returns:
            object: value of the requested attribute.
        """
        return self.dataset.attrs[attr]

    def _decode_attributes(self):
        """decode the standard attributes"""
        self._decode_time_coverage()
        if self._std2native_attr is not None:
            for stdattr, attr in self._std2native_attr.items():
                self.dataset.attrs[stdattr] = self.dataset.attrs.pop(attr)

    def _decode_time_coverage(self):
        """decode time coverage attributes and update the translation table"""
        try:
            start = self._decode_time_coverage_attr('start')
            end = self._decode_time_coverage_attr('end')
            if not isinstance(start, datetime.datetime):
                self.dataset.attrs['time_coverage_start'] = parser.parse(start)
            if not isinstance(end, datetime.datetime):
                self.dataset.attrs['time_coverage_end'] = parser.parse(end)
        except (TypeError, ValueError):
            logging.error('Unexpected date values {}, {}'.format(start, end))
            self.dataset.attrs['time_coverage_start'] = None
            self.dataset.attrs['time_coverage_end'] = None

    def _decode_time_coverage_attr(self, date):
        """Decode start/end time attributes

        Returns:
            datetime: start time of the data in file.
        """
        attrs = self.dataset.attrs

        # time_coverage_start/end
        for attrstyle in ['time_coverage', 'time_coverage2', 'meas_time']:
            attrdate = TIME_COVERAGE_ATTRS[attrstyle][date]
            if attrdate in attrs:
                return attrs[attrdate]

        # start/end date and time attributes
        attrdate = TIME_COVERAGE_ATTRS['date'][date]
        if attrdate in attrs:
            attrtime = TIME_COVERAGE_ATTRS['time'][date]
            if attrtime in attrs:
                # combine separate date and time attribute values
                return '{}T{}'.format(attrs[attrdate], attrs[attrtime])
            return attrs[attrdate]

    def get_start_time(self, inspect=False):
        """
        Args:
            inspect (bool): if not provided by a global attribute, `cerbere`
                will search in the time field. May take some time so use with
                caution.

        Returns:
            datetime: minimum sensing time in data
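
        For instance:

        >>> start = dst.get_start_time(inspect=True)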
        """
        date = self.dataset.attrs.get('time_coverage_start')
        if date is None and inspect:
            date = self.get_times().min()
            # fill in for next time
            self.dataset.attrs['time_coverage_start'] = date
        return date

    def get_end_time(self, inspect=False):
        """
        Args:
            inspect (bool): if not provided by a global attribute, `cerbere`
                will search in the time field. May take some time so use with
                caution.

        Returns:
            datetime: maximum sensing time in data
        """
        date = self.dataset.attrs.get('time_coverage_end')
        if date is None and inspect:
            date = self.get_times().max()
            # fill in for next time
            self.dataset.attrs['time_coverage_end'] = date
        return date

    def get_bbox(self):
        """
        Return the bounding box of the feature, as a shapely box built from
        (lonmin, latmin, lonmax, latmax).
        """
        attrs = self.attrs

        bbox = []
        for limit in ['lonmin', 'latmin', 'lonmax', 'latmax']:
            for att in BBOX_CORNERS[limit]:
                fatt = next((x for x in attrs if x == att), None)
                if fatt is not None:
                    bbox.append(attrs[fatt])
        if len(bbox) != 4:
            return None

        # case ECMWF converted from Grib!
        #         if bbox[0] == 360. and bbox[2] == 0.:
        #             bbox[0] = 0.
        #             bbox[2] = 360.

        return shapely.geometry.box(*bbox)

    def _get_attr_value(self, att):
        """Return global attribute value or None if the attribute does not
        exists.
        """
        try:
            return self.dataset.attrs[att]
        except KeyError:
            return None

    def get_product_version(self):
        """return the product version"""
        return self._get_attr_value('product_version')

    def close(self):
        """Close file"""
        if isinstance(self.dataset, Dataset):
            return self.dataset.close()

        self._url = None
        if self.original is not None:
            self.original.close()
            self.original = None

    def _format_attrs(self, dataset):
        # convert datetime objects to string
        for att in dataset.attrs:
            if isinstance(dataset.attrs[att], datetime.datetime):
                self.original.attrs[att] = dataset.attrs[att].strftime(
                    self._get_time_format()
                )

    def save(self,
             dest=None,
             format='NETCDF4',
             preprocess=None,
             attr_file=None
             ):
        """
        Args:
            dest (str, optional): save to a new file, whose path is provided
                in this argument. Raise an exception if the file already
                exists.
            format (str, optional): format of the output file (default:
                'NETCDF4').
            preprocess (callable, optional): apply a specific format
                transformation before saving (otherwise the default xarray
                formatting is used).
            attr_file (str, optional): path to an attribute file template
                providing default global attributes to add when saving.
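
        A usage sketch (the output path is illustrative):

        >>> dst.save(dest='./mynewfile.nc', format='NETCDF4')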
        """
        if isinstance(self.dataset, Dataset):
            return self.dataset.save(
                dest=dest,
                format=format,
                preprocess=preprocess,
                attr_file=attr_file
            )

        # save as a new file
        if dest is not None:
            if isinstance(dest, str):
                # save in a file
                if os.path.exists(dest):
                    raise IOError("This file already exists")

                self._url = dest
                self._mode = WRITE_NEW
                self._format = format
            elif isinstance(dest, Dataset):
                if not dest.is_empty():
                    raise IOError(
                        "Can not save into a dataset that is not empty."
                    )
                if dest.is_readonly():
                    raise IOError("destination dataset is read-only")
                # use dest dataset as formatter
                dest.dataset = self.dataset.copy()
                return dest.save(
                    format=format,
                    preprocess=preprocess,
                    attr_file=attr_file
                )


        # save in the file currently attached to the dataset
        else:
            if self._mode == READ_ONLY:
                raise IOError(
                    "Can not save a dataset open in read only mode or with no "
                    "url provided"
                )

        # no file is attached to this dataset yet
        if self.is_empty():
            self.original = self.dataset.copy(deep=False)

        # synchronize dataset and original content, use original attribute and
        # variable naming conventions
        self._sync_internals()

        # apply new formatting rules
        if preprocess is not None:
            saved_dataset = preprocess(self.original)
        else:
            saved_dataset = self._convert_format(attr_file=attr_file)

        # save to chosen format
        if 'NETCDF' in self._format:

            # ensure proper type in output attributes for the considered format
            self._format_nc_attrs(saved_dataset)

            for v in saved_dataset.variables:
                if 'zlib' not in saved_dataset[v].encoding:
                    saved_dataset[v].encoding['zlib'] = True
                if 'complevel' not in saved_dataset[v].encoding:
                    saved_dataset[v].encoding['complevel'] = 4
                if '_FillValue' not in saved_dataset[v].encoding:
                    if '_FillValue' in saved_dataset[v].attrs:
                        fillv = saved_dataset[v].attrs.pop('_FillValue')
                    else:
                        fillv = np.ma.maximum_fill_value(saved_dataset[v].dtype)
                    saved_dataset[v].encoding['_FillValue'] = fillv
            saved_dataset.to_netcdf(
                path=self._url,
                mode={'r+': 'a', 'w': 'w'}[self._mode],
                format=self._format,
                engine='netcdf4'
            )

        else:
            logging.error('Unknown output format : {}'.format(self._format))
            raise NotImplementedError

    def _sync_internals(self):
        """Update the mapper version of the dataset. Add the missing arrays
        and update changed attributes
        """
        # first add new fields not yet saved on file
        newfields = []
        for f in self.dataset.data_vars.keys():
            if self._get_native_fieldname(f) not in self.original.data_vars:
                newfields.append(f)
        if len(newfields) > 0:
            tmpdst = self.dataset[newfields].rename(
                **{f: self._get_native_fieldname(f) for f in newfields},
                **{d: self._get_matching_dimname(d) for d in self.dataset.dims}
            )
            self.original = self.original.merge(tmpdst)

        # add missing attributes
        for attr in self.dataset.attrs:
            attrval = self._format_attr(self.dataset.attrs[attr])
            if attr not in self.original.attrs:
                self.original.attrs[attr] = attrval
            else:
                if attrval != self.original.attrs[attr]:
                    logging.debug("Change attribute {} with value {}"
                                  .format(attr, attrval))
                    self.original.attrs[attr] = attrval

    def _convert_format(self, attr_file=None):
        """Implement specific formatting rules to a dataset.

        Used before saving the dataset to match some specific format
        requirements when writing the dataset on file.
        """
        dataset = self.original.copy()

        # add attributes from attribute file template
        attrs = default_global_attrs(path=attr_file)
        attrs.update(dataset.attrs)
        dataset.attrs = attrs

        return dataset

    @classmethod
    def _format_nc_attrs(cls, dataset, *args, **kwargs):
        """format the attributes in an acceptable type for netCDF"""
        # global attrs
        for att in list(dataset.attrs):
            # remove None attributes
            if dataset.attrs[att] is None:
                dataset.attrs.pop(att)
                continue
            # convert datetime objects to string
            dataset.attrs[att] = cls._format_attr(dataset.attrs[att])

        # variable attrs
        for varname in dataset._variables:
            var = dataset._variables[varname]
            invalid_attrs = []
            for att in var.attrs:
                if var.attrs[att] is None:
                    invalid_attrs.append(att)
                    continue
            for att in invalid_attrs:
                var.attrs.pop(att)

    @classmethod
    def _format_attr(cls, attrval):
        if isinstance(attrval, datetime.datetime):
            return attrval.strftime(cls._get_time_format())
        elif attrval is None:
            return ''
        return attrval

    def get_collection_id(self):
        """return the identifier of the product collection"""
        raise NotImplementedError

    def get_naming_authority(self):
        """return the naming authority, as read from the 'Conventions'
        global attribute"""
        return self._get_attr_value('Conventions')

    def get_field_handler(self, fieldname):
        if fieldname in self.fieldnames \
                or fieldname in self.coordnames:
            status = SAVED
        else:
            status = NOTSAVED
        descr = FieldHandler(self, fieldname, status=status)
        return descr