tab-dataset.tab_dataset.dataset_interface

The dataset_interface module is part of the tab-dataset package.
It contains the DatasetInterface class, which groups the export, display and plot methods for Dataset entities.
For more information, see the user guide or the github repository.
View Source
  1# -*- coding: utf-8 -*-
  2"""
  3The `dataset_interface` module is part of the `tab-dataset` package.
  4
  5It contains the classes `DatasetInterface` for Dataset entities.
  6
  7For more information, see the 
  8[user guide](https://loco-philippe.github.io/tab-dataset/docs/user_guide.html) 
  9or the [github repository](https://github.com/loco-philippe/tab-dataset).
 10"""
 11
 12import csv
 13import math
 14import json
 15import xarray
 16import numpy as np
 17import matplotlib.pyplot as plt
 18from tabulate import tabulate
 19
 20from json_ntv.ntv import NtvList, NtvJsonEncoder
 21from tab_dataset.cfield import Cutil
 22from tab_dataset.cdataset import DatasetError
 23
 24
 25class DatasetInterface:
 26    '''this class includes Dataset methods :
 27
 28    - `DatasetInterface.json`
 29    - `DatasetInterface.plot`
 30    - `DatasetInterface.to_ntv`
 31    - `DatasetInterface.to_csv`
 32    - `DatasetInterface.to_file`
 33    - `DatasetInterface.to_xarray`
 34    - `DatasetInterface.to_dataframe`
 35    - `DatasetInterface.view`
 36    - `DatasetInterface.vlist`
 37    - `DatasetInterface.voxel`
 38    '''
 39
 40    def json(self, **kwargs):
 41        '''
 42        Return json dict, json string or Cbor binary.
 43
 44        *Parameters (kwargs)*
 45
 46        - **encoded** : boolean (default False) - choice for return format
 47        (string/bytes if True, dict else)
 48        - **format**  : string (default 'json')- choice for return format (json, cbor)
 49        - **codif** : dict (default ES.codeb). Numerical value for string in CBOR encoder
 50        - **modecodec** : string (default 'optimize') - if 'full', each index is with a full codec
 51        if 'default' each index has keys, if 'optimize' keys are optimized,
 52        if 'dict' dict format is used, if 'nokeys' keys are absent
 53        - **name** : boolean (default False) - if False, default index name are not included
 54        - **geojson** : boolean (default False) - geojson for LocationValue if True
 55
 56        *Returns* : string or dict'''
 57        return self.to_obj(**kwargs)
 58
 59    def plot(self, varname=None, idxname=None, order=None, line=True, size=5,
 60             marker='o', maxlen=20):
 61        '''
 62        This function visualize data with line or colormesh.
 63
 64        *Parameters*
 65
 66        - **varname** : string (default none) - Name of the variable to use. If None,
 67        first lvarname is used.
 68        - **line** : Boolean (default True) - Choice line or colormesh.
 69        - **order** : list (defaut None) - order of the axes (x, y, hue or col)
 70        - **size** : int (defaut 5) - plot size
 71        - **marker** : Char (default 'o') - Symbol for each point.
 72        - **maxlen** : Integer (default 20) - maximum length for string
 73
 74        *Returns*
 75
 76        - **None**  '''
 77        if not self.consistent:
 78            return None
 79        if idxname:
 80            idxname = [name for name in idxname if len(
 81                self.nindex(name).codec) > 1]
 82        #xar = self.to_xarray(numeric=True, varname=varname, idxname=idxname, lisfunc=[util.cast],##
 83        xar = self.to_xarray(numeric=True, varname=varname, idxname=idxname, lisfunc=None,
 84                             dtype='str', npdtype='str', maxlen=maxlen, coord=True)
 85        if not order:
 86            order = [0, 1, 2]
 87
 88        if len(xar.dims) == 1:
 89            xar.plot.line(x=xar.dims[0]+'_row', size=size, marker=marker)
 90        elif len(xar.dims) == 2 and line:
 91            xar.plot.line(x=xar.dims[order[0]] + '_row',
 92                          xticks=list(xar.coords[xar.dims[0]+'_row'].values),
 93                          hue=xar.dims[order[1]], size=size, marker=marker)
 94        elif len(xar.dims) == 2 and not line:
 95            xar.plot(x=xar.dims[order[0]]+'_row', y=xar.dims[order[1]]+'_row',
 96                     xticks=list(xar.coords[xar.dims[order[0]]+'_row'].values),
 97                     yticks=list(xar.coords[xar.dims[order[1]]+'_row'].values),
 98                     size=size)
 99        elif len(xar.dims) == 3 and line:
100            xar.plot.line(x=xar.dims[order[0]] + '_row', col=xar.dims[order[1]],
101                          xticks=list(
102                xar.coords[xar.dims[order[0]]+'_row'].values),
103                hue=xar.dims[order[2]], col_wrap=2, size=size, marker=marker)
104        elif len(xar.dims) == 3 and not line:
105            xar.plot(x=xar.dims[order[0]]+'_row', y=xar.dims[order[1]]+'_row',
106                     xticks=list(xar.coords[xar.dims[order[0]]+'_row'].values),
107                     yticks=list(xar.coords[xar.dims[order[1]]+'_row'].values),
108                     col=xar.dims[order[2]], col_wrap=2, size=size)
109        plt.show()
110        return {xar.dims[i]: list(xar.coords[xar.dims[i]].values) for i in range(len(xar.dims))}
111
112    def to_csv(self, filename, optcsv={'quoting': csv.QUOTE_NONNUMERIC}, **kwargs):
113        '''
114        Generate csv file to display data.
115
116        *Parameters*
117
118        - **filename** : string - file name (with path)
119        - **optcsv** : parameter for csv.writer
120
121        *Parameters (kwargs)*
122
123        - **name=listcode** : element (default None) - eg location='ns'
124            - listcode : string with Code for each index (j: json, n: name, s: simple).
125            - name : name of the index
126        - **lenres** : Integer (default : 0) - Number of raws (all if 0)
127        - **header** : Boolean (default : True) - If True, first line with names
128        - **optcsv** : parameter for csv.writer
129        - **ifunc** : function (default None) - function to apply to indexes
130        - **other kwargs** : parameter for ifunc
131
132        *Returns* : size of csv file '''
133        size = 0
134        if not optcsv:
135            optcsv = {}
136        tab = self._to_tab(**kwargs)
137        with open(filename, 'w', newline='', encoding="utf-8") as csvfile:
138            writer = csv.writer(csvfile, **optcsv)
139            for lign in tab:
140                size += writer.writerow(lign)
141        return size
142
143    def to_dataframe(self, info=False, idx=None, fillvalue='?', fillextern=True,
144                     lisfunc=None, name=None, numeric=False, npdtype=None, **kwargs):
145        '''
146        Complete the Object and generate a Pandas DataFrame with the dimension define by idx.
147
148        *Parameters*
149
150        - **info** : boolean (default False) - if True, add _dict attributes to attrs Xarray
151        - **idx** : list (default none) - list of idx to be completed. If [],
152        self.primary is used.
153        - **fillvalue** : object (default '?') - value used for the new extval
154        - **fillextern** : boolean(default True) - if True, fillvalue is converted to internal value
155        - **lisfunc** : function (default none) - list of function to apply to indexes before export
156        - **name** : string (default None) - DataArray name. If None, variable name
157        - **numeric** : Boolean (default False) - Generate a numeric DataArray.Values.
158        - **npdtype** : string (default None) - numpy dtype for the DataArray ('object' if None)
159        - **kwargs** : parameter for lisfunc
160
161        *Returns* : pandas.DataFrame '''
162        if self.consistent:
163            return self.to_xarray(info=info, idx=idx, fillvalue=fillvalue,
164                                  fillextern=fillextern, lisfunc=lisfunc, name=name,
165                                  numeric=numeric, npdtype=npdtype, **kwargs
166                                  ).to_dataframe(name=name)
167        return None
168
169    def to_file(self, filename, **kwargs):
170        '''Generate file to display data.
171
172         *Parameters (kwargs)*
173
174        - **filename** : string - file name (with path)
175        - **kwargs** : see 'to_ntv' parameters
176
177        *Returns* : Integer - file lenght (bytes)  '''
178        option = {'format': 'cbor', 'modecodec': 'optimize'} | kwargs | {
179            'encoded': True}
180        data = self.to_ntv(modecodec=option['modecodec']).to_obj(**option)
181        if option['format'] == 'cbor':
182            size = len(data)
183            with open(filename, 'wb') as file:
184                file.write(data)
185        else:
186            size = len(bytes(data, 'UTF-8'))
187            with open(filename, 'w', newline='', encoding="utf-8") as file:
188                file.write(data)
189        return size
190
191    def to_ntv(self, modecodec='optimize', def_type='json', name=False):
192        '''Return a Ntv tab value (whithout name) .
193
194        *Parameters (kwargs)*
195
196        - **modecodec** : string (default 'optimize') - if 'full', each index is with a full codec
197        if 'default' each index has keys, if 'optimize' keys are optimized,
198        if 'dict' dict format is used, if 'nokeys' keys are absent
199        - **def_type** : string (default 'json') - default ntv_type for NtvList or NtvSet
200        - **name** : boolean (default False) - if False, default index name are not included
201
202
203        *Returns* : Ntv object'''
204        idxname = [name or iname != 'i' + str(i)
205                   for i, iname in enumerate(self.lname)]
206        if modecodec != 'optimize':
207            lis = [index.to_ntv(modecodec=modecodec, name=iname)
208                   for index, iname in zip(self.lindex, idxname)]
209        else:
210            lis = []
211            anafields = self.anafields
212            for idx, iname, anafld in zip(self.lindex, idxname, anafields):
213                coef = Cutil.encode_coef(idx.keys)
214                parent = anafld.p_derived.view('index')
215                if anafld.category == 'unique':
216                    lis.append(idx.to_ntv(name=iname))
217                elif anafld.category == 'coupled':
218                    idx_coup = idx.setkeys(
219                        self.lindex[parent].keys, inplace=False)
220                    lis.append(idx_coup.to_ntv(parent=parent, name=iname))
221                elif coef:
222                    lis.append(idx.to_ntv(keys=[coef], name=iname))
223                elif parent == -1:  # cat='variable' or 'secondary'
224                    if idx.keys == list(range(len(self))):
225                        lis.append(idx.to_ntv(modecodec='full', name=iname))
226                    else:
227                        lis.append(idx.to_ntv(modecodec='default', name=iname))
228                else:  # derived
229                    if len(self.lindex[parent].codec) == len(self):
230                        lis.append(idx.to_ntv(modecodec='default', name=iname))
231                    else:  # derived
232                        keys = idx.derkeys(self.lindex[parent])
233                        lis.append(idx.to_ntv(
234                            keys=keys, parent=parent, name=iname))
235        return NtvList(lis, self.name)
236
237    def to_xarray(self, info=False, idxname=None, varname=None, fillvalue='?',
238                  fillextern=True, lisfunc=None, name=None, numeric=False,
239                  npdtype=None, attrs=None, coord=False, **kwargs):
240        '''
241        Complete the Object and generate a Xarray DataArray with the dimension define by idx.
242        Only the first variable is incuded.
243
244        *Parameters*
245
246        - **info** : boolean (default False) - if True, add _dict attributes to attrs Xarray
247        - **idxname** : list (default none) - list of choosen primary fields. If None,
248        self.primary is used.
249        - **varname** : string (default none) - Name of the variable to use. If None,
250        first lvarname is used.
251        - **fillvalue** : object (default '?') - value used for the new extval
252        - **fillextern** : boolean(default True) - if True, fillvalue is converted to internal value
253        - **lisfunc** : function (default none) - list of function to apply to indexes before export
254        - **name** : string (default None) - DataArray name. If None, variable name
255        - **numeric** : Boolean (default False) - Generate a numeric DataArray.Values.
256        - **npdtype** : string (default None) - numpy dtype for the DataArray ('object' if None)
257        - **attrs** : dict (default None) - attributes for the DataArray
258        - **coord** : boolean (default False) - if True, add derivated coords
259        - **kwargs** : parameter for lisfunc
260
261        *Returns* : DataArray '''
262        option = {'dtype': None} | kwargs
263        if not self.consistent:
264            raise DatasetError("Dataset not consistent")
265        if idxname is None or idxname == []:
266            idxname = self.primaryname
267        ilf = self.full(idxname=idxname, varname=varname, fillvalue=fillvalue,
268                        fillextern=fillextern, inplace=False)
269        ilf.setcanonorder()
270        if not varname and len(ilf.lvarname) != 0:
271            varname = ilf.lvarname[0]
272        if not varname in ilf.lname:
273            ivar = -1
274        else:
275            ivar = ilf.lname.index(varname)
276        if isinstance(lisfunc, list) and len(lisfunc) == 1:
277            lisfunc = lisfunc * ilf.lenindex
278        elif isinstance(lisfunc, list) and len(lisfunc) != ilf.lenindex:
279            lisfunc = [None] * ilf.lenindex
280        elif not isinstance(lisfunc, list):
281            funcvar = lisfunc
282            lisfunc = [None] * ilf.lenindex
283            if ivar != -1:
284                lisfunc[ivar] = funcvar
285        lisfuncname = dict(zip(ilf.lname, lisfunc))
286        coords = ilf._xcoord(idxname, ivar, lisfuncname, coord, **option)
287        dims = idxname
288        if numeric:
289            #lisfunc[ivar] = util.cast
290            fillvalue = math.nan
291            npdtype = 'float'
292            option['dtype'] = 'float'
293        if ivar == -1:
294            data = self.field(list(range(len(ilf)))).to_numpy(npdtype='int')\
295                .reshape([len(ilf.nindex(name).codec) for name in idxname])
296        else:
297            data = ilf.lindex[ivar]\
298                .to_numpy(func=lisfunc[ivar], npdtype=npdtype, **option)\
299                .reshape([len(ilf.nindex(name).codec) for name in idxname])
300        if not name and ivar == -1:
301            name = ilf.name
302        elif not name:
303            name = ilf.lname[ivar]
304        if not isinstance(attrs, dict):
305            attrs = {}
306        for nam in ilf.lunicname:
307            attrs[nam] = ilf.nindex(nam).codec[0]
308        if info:
309            attrs |= ilf.indexinfos()
310        #print(data, coords, dims, attrs, name)
311        return xarray.DataArray(data, coords, dims, attrs=attrs, name=name)
312
313    def voxel(self, idxname=None, varname=None):
314        '''
315        Plot not null values in a cube with voxels and return indexes values.
316
317        *Parameters*
318
319        - **idxname** : list (default none) - list of idx to be completed. If None,
320        self.primary is used.
321        - **varname** : string (default none) - Name of the variable to use. If None,
322        first lvarname is used.
323
324        *Returns* : **dict of indexes values**
325        '''
326        if not self.consistent:
327            return None
328        if idxname is None or idxname == []:
329            idxname = self.primaryname
330        if varname is None and self.lvarname:
331            varname = self.lvarname[0]
332        if len(idxname) > 3:
333            raise DatasetError('number of idx > 3')
334        if len(idxname) == 2:
335            self.addindex(self.field('null', ' ', keys=[0]*len(self)))
336            idxname += [' ']
337        elif len(idxname) == 1:
338            self.addindex(self.field('null', ' ', keys=[0]*len(self)))
339            self.addindex(self.field('null', '  ', keys=[0]*len(self)))
340            idxname += [' ', '  ']
341        xar = self.to_xarray(idxname=idxname, varname=varname, fillvalue='?',
342                             fillextern=False, lisfunc=Cutil.is_not_equal, tovalue='?')
343        axe = plt.figure().add_subplot(projection='3d')
344        axe.voxels(xar, edgecolor='k')
345        axe.set_xticks(np.arange(self.idxlen[self.idxname.index(xar.dims[0])]))
346        axe.set_yticks(np.arange(self.idxlen[self.idxname.index(xar.dims[1])]))
347        axe.set_zticks(np.arange(self.idxlen[self.idxname.index(xar.dims[2])]))
348        axe.set(xlabel=xar.dims[0][:8],
349                ylabel=xar.dims[1][:8],
350                zlabel=xar.dims[2][:8])
351        plt.show()
352        self.delindex([' ', '  '])
353        return {xar.dims[i]: list(xar.coords[xar.dims[i]].values)
354                for i in range(len(xar.dims))}
355
356    def view(self, **kwargs):
357        '''
358        Generate tabular list to display data.
359
360        *Parameters (kwargs)*
361
362        - **name=listcode** : element (default None) - eg location='ns'
363            - listcode : string with Code for each index (j: json, n: name, s: simple).
364            - name : name of the index
365        - **defcode** : String (default : 'j') - default list code (if 'all' is True)
366        - **all** : Boolean (default : True) - 'defcode apply to all indexes or none
367        - **lenres** : Integer (default : 0) - Number of raws (all if 0)
368        - **header** : Boolean (default : True) - First line with names
369        - **width** : Integer (default None) - Number of characters displayed for each
370        attribute (all if None)
371        - **ifunc** : function (default None) - function to apply to indexes
372        - **tabulate params** : default 'tablefmt': 'simple', 'numalign': 'left',
373        'stralign': 'left', 'floatfmt': '.3f' - See tabulate module
374        - **other kwargs** : parameter for ifunc
375
376        *Returns* : list or html table (tabulate format) '''
377        opttab = {'defcode': 'j', 'all': True, 'lenres': 0, 'header': True}
378        optview = {'tablefmt': 'simple', 'numalign': 'decimal',
379                   'stralign': 'left', 'floatfmt': '.2f'}
380        option = opttab | optview | kwargs
381        tab = self._to_tab(**option)
382        width = ({'width': None} | kwargs)['width']
383        if width:
384            #tab = [[(lambda x: x[:width] if isinstance(x, str) else x)(val)
385            tab = [[val[:width] if isinstance(val, str) else val
386                    for val in lig] for lig in tab]
387        return tabulate(tab, headers='firstrow', **{k: option[k] for k in optview})
388
389    def vlist(self, *args, func=None, index=-1, **kwargs):
390        '''
391        Apply a function to an index and return the result.
392
393        *Parameters*
394
395        - **func** : function (default none) - function to apply to extval or extidx
396        - **args, kwargs** : parameters for the function
397        - **index** : integer - index to update (index=-1 for first variable)
398
399        *Returns* : list of func result'''
400        if index == -1 and self.lvar:
401            return self.lvar[0].vlist(func, *args, **kwargs)
402        if index == -1 and self.lenindex == 1:
403            index = 0
404        return self.lindex[index].vlist(func, *args, **kwargs)
405
406    # %%internal
407
408    def _to_tab(self, **kwargs):
409        ''' data preparation (dict of dict) for view or csv export.
410        Representation is included if :
411            - code is definie in the name element of the field
412            - or code is defined in 'defcode' element and 'all' element is True
413
414        *Parameters (kwargs)*
415
416        - **name=listcode** : element (default None) - eg location='ns'
417            - listcode : string with Code for each index (j: json, n: name, s: simple, f: ifunc).
418            - name : name of the index
419        - **defcode** : String (default : 'j') - default list code (if 'all' is True)
420        - **all** : Boolean (default : True) - 'defcode apply to all indexes or none
421        - **lenres** : Integer (default : 0) - Number of raws (all if 0)
422        - **ifunc** : function (default None) - function to apply to indexes
423        - **other kwargs** : parameter for ifunc'''
424
425        option = {'defcode': 'j', 'all': True, 'lenres': 0, 'ifunc': None,
426                  'header': True} | kwargs
427        tab = []
428        reslist = []
429        diccode = {'j': '', 'n': 'name-', 's': 'smpl-', 'f': 'func-'}
430        if option['header']:
431            for name in self.lname:
432                opt = name if name in option else 'defcode'
433                if opt != 'defcode' or option['all']:
434                    for char, code in diccode.items():
435                        if char in option[opt]:
436                            reslist.append(code + name)
437            tab.append(reslist)
438        lenres = option['lenres']
439        if lenres == 0:
440            lenres = len(self)
441        for i in range(min(lenres, len(self))):
442            reslist = []
443            for name in self.lname:
444                opt = name if name in option else 'defcode'
445                if opt != 'defcode' or option['all']:
446                    for char, code in diccode.items():
447                        if char in option[opt]:
448                            val = self.nindex(name).values[i]
449                            if char == 'j':
450                                #reslist.append(util.cast(val, dtype='json'))
451                                reslist.append(json.dumps(
452                                    self.field.s_to_e(val), cls=NtvJsonEncoder))
453                            elif char == 'n':
454                                reslist.append(self.field.i_to_name(val))
455                            elif char == 's':
456                                reslist.append(json.dumps(
457                                    self.field.s_to_e(val), cls=NtvJsonEncoder))
458                            elif char == 'f':
459                                reslist.append(Cutil.funclist(
460                                    val, option['ifunc'], **kwargs))
461            tab.append(reslist)
462        return tab
463
464    def _xcoord(self, axename, ivar, lisfuncname=None, coord=False, **kwargs):
465        ''' Coords generation for Xarray'''
466        #maxlen = kwargs.get('maxlen', 20)
467        #info = self.indexinfos()
468        dic_part = self.field_partition(axename)
469        coords = {}
470        ana = self.analysis
471        for i in range(self.lenindex):
472            #fieldi = info[i]
473            iname = self.lname[i]
474            # if fieldi['pparent'] == -1 or i == ivar:
475            if i in dic_part['variable'] or i in dic_part['unique'] or i == ivar:
476                continue
477            if isinstance(lisfuncname, dict) and len(lisfuncname) == self.lenindex:
478                funci = lisfuncname[iname]
479            else:
480                funci = None
481            if iname in axename:
482                coords[iname] = self.lindex[i].to_numpy(
483                    func=funci, codec=True, **kwargs)
484                if coord:
485                    coords[iname+'_row'] = (iname,
486                                            np.arange(len(coords[iname])))
487                    coords[iname+'_str'] = (iname,
488                                            self.lindex[i].to_numpy(func=str, codec=True))
489            else:
490                #ascendants = self.analysis.fields[i].ascendants('derived', 'index') # !!!!!!
491                #p_prim = [ind for ind in ascendants if self.lname[ind] in axename][0]
492                #p_prim = self.analysis.fields[i].ascendants('derived', 'index')[-1]
493                #self.lindex[i].setkeys(self.lindex[p_prim].keys)  # !!!
494                #coords[iname] = (self.lname[p_prim],
495                #                 self.lindex[i].to_numpy(func=funci, codec=True, **kwargs))
496                f_prim = [self.nindex(name) for name in axename if
497                  ana.get_relation(i, name).typecoupl in ['derived', 'coupled']][0]
498                self.lindex[i].setkeys(f_prim.keys)  # !!!
499                coords[iname] = (f_prim.name, self.lindex[i].to_numpy(
500                                            func=funci, codec=True, **kwargs))
501        return coords