tab-dataset.tab_dataset.cdataset

The cdataset module is part of the tab-dataset package.

It contains the DatasetAnalysis and Cdataset classes for Dataset entities.

For more information, see the user guide or the github repository.
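
A minimal quick-start sketch (illustrative only, not taken from the package documentation): it builds a small Cdataset from three Cfield objects, assuming Cfield accepts (codec, name, keys) as used in the DataFrame branch of Cdataset.__init__ below; the data values are invented. Later examples on this page reuse this ds object.

# hedged quick-start sketch - field values and the Cfield(codec, name, keys) call are assumptions
from tab_dataset.cfield import Cfield
from tab_dataset.cdataset import Cdataset

year = Cfield([2020, 2021], 'year', [0, 0, 1, 1])             # 2 codec values, 4 records
city = Cfield(['paris', 'lyon'], 'city', [0, 1, 0, 1])        # 2 codec values, 4 records
temp = Cfield([10, 12, 11, 13], 'temperature', [0, 1, 2, 3])  # 4 codec values, 4 records

ds = Cdataset([year, city, temp], name='weather')
print(ds)         # lists the three fields
print(len(ds))    # expected: 4 (number of records)
print(ds.lname)   # expected: ['year', 'city', 'temperature']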

  1# -*- coding: utf-8 -*-
  2"""
  3The `cdataset` module is part of the `tab-dataset` package.
  4
  5It contains the `DatasetAnalysis` and `Cdataset` classes for Dataset entities.
  6
  7For more information, see the 
  8[user guide](https://loco-philippe.github.io/tab-dataset/docs/user_guide.html) 
  9or the [github repository](https://github.com/loco-philippe/tab-dataset).
 10"""
 11from copy import copy
 12
 13from tab_dataset.cfield import Cfield, Cutil
 14
 15from json_ntv.ntv import Ntv
 16from json_ntv.ntv_util import NtvUtil, NtvConnector
 17
 18from tab_analysis.analysis import AnaDataset, Util
 19
 20
 21class DatasetAnalysis:
 22    '''This class is the interface between Cdataset and the tab_analysis module.'''
 23
 24# %% property
 25    @property
 26    def analysis(self):
 27        '''The analysis attribute is associated to the AnaDataset object'''
 28        if self._analysis is None or self._analysis.hashd != self._hashd:
 29            self._analysis = AnaDataset(self.to_analysis(True))
 30        return self._analysis
 31
 32    @property
 33    def anafields(self):
 34        ''' list of AnaField'''
 35        return self.analysis.fields
 36
 37    @property
 38    def partitions(self):
 39        ''' list of partitions defined with index representation (AnaDataset method)'''
 40        return self.analysis.partitions('index')
 41
 42    @property
 43    def complete(self):
 44        ''' complete property of the dataset (AnaDataset method)'''
 45        return self.analysis.complete
 46
 47    @property
 48    def dimension(self):
 49        ''' dimension of the dataset (AnaDataset method)'''
 50        return self.analysis.dimension
 51
 52    @property
 53    def lvarname(self):
 54        ''' list of variable Field name (AnaDataset method)'''
 55        return Util.view(self.analysis.variable, mode='id')
 56
 57    @property
 58    def primaryname(self):
 59        ''' list of primary name (AnaDataset method)'''
 60        return Util.view(self.analysis.primary, mode='id')
 61
 62    @property
 63    def secondaryname(self):
 64        ''' list of secondary name (AnaDataset method)'''
 65        return Util.view(self.analysis.secondary, mode='id')
 66
 67
 68# %% methods
 69
 70    def indexinfos(self, keys=None):
 71        '''return a dict with infos of each index (AnaDataset method) :
 72            
 73        - num, name, cat, diffdistparent, child, parent, distparent,
 74        crossed, pparent, rateder (struct info)
 75        - lencodec, mincodec, maxcodec, typecodec, ratecodec (base info)
 76
 77        *Parameters*
 78
 79        - **keys** : string, list or tuple (default None) - list of attributes
 80        to be returned.
 81        if 'all' or None, all attributes are returned.
 82        if 'struct', only structural attributes are returned.
 83
 84        *Returns* : dict'''
 85        return self.analysis.to_dict(mode='index', keys=keys)
 86
 87    def field_partition(self, partition=None, mode='index'):
 88        '''return a partition dict with the list of primary, secondary, unique
 89        and variable fields (index).
 90
 91         *Parameters*
 92
 93        - **partition** : list (default None) - if None, partition is the first
 94        - **mode** : str (default 'index') - Field representation ('id', 'index')
 95        '''
 96        if not partition and len(self.partitions) > 0:
 97            partition = self.partitions[0]
 98        part = [self.analysis.dfield(fld)
 99                for fld in partition] if partition else None
100        return self.analysis.field_partition(mode=mode, partition=part,
101                                             distributed=True)
102
103    def relation(self, fld1, fld2):
104        '''relationship between two fields (AnaDataset method)'''
105        return self.analysis.get_relation(fld1, fld2)
106
107    def tree(self, mode='derived', width=5, lname=20, string=True):
108        '''return a string with a tree of derived Field (AnaDataset method).
109
110         *Parameters*
111
112        - **lname** : integer (default 20) - length of the names
113        - **width** : integer (default 5) - length of the lines
114        - **string** : boolean (default True) - if True return str else return dict
115        - **mode** : string (default 'derived') - kind of tree :
116            'derived' : derived tree
117            'distance': min distance tree
118            'distomin': min distomin tree
119        '''
120        return self.analysis.tree(mode=mode, width=width, lname=lname, string=string)
121
122    def indicator(self, fullsize=None, size=None):
123        '''generate size indicators: ol (object lightness), ul (unicity level),
124        gain (sizegain)
125
126        *Parameters*
127
128        - **fullsize** : int (default None) - size with full codec
129        - **size** : int (default None) - size with existing codec
130
131        *Returns* : dict'''
132        if not fullsize:
133            fullsize = len(self.to_obj(encoded=True, modecodec='full'))
134        if not size:
135            size = len(self.to_obj(encoded=True))
136        return self.analysis.indicator(fullsize, size)
137
138
139class Cdataset(DatasetAnalysis):
140    # %% magic
141    '''
142    A `Cdataset` is a representation of tabular data.
143
144    *Attributes (for @property see methods)* :
145
146    - **lindex** : list of Field
147    - **name** : name of the Cdataset
148    - **_analysis** : AnaDataset object
149
150    The methods defined in this class are :
151
152    *constructor (@classmethod)*
153
154    - `Cdataset.ntv`
155    - `Cdataset.from_ntv`
156
157    *dynamic value - module analysis (getters @property)*
158
159    - `DatasetAnalysis.analysis`
160    - `DatasetAnalysis.anafields`
161    - `DatasetAnalysis.lvarname`
162    - `DatasetAnalysis.partitions`
163    - `DatasetAnalysis.primaryname`
164    - `DatasetAnalysis.secondaryname`
165    - `DatasetAnalysis.complete`
166    - `DatasetAnalysis.dimension`
167
168    *selecting - infos methods (module analysis)*
169
170    - `DatasetAnalysis.field_partition`
171    - `DatasetAnalysis.indexinfos`
172    - `DatasetAnalysis.indicator`
173    - `DatasetAnalysis.relation`
174    - `DatasetAnalysis.tree`
175
176    *dynamic value (getters @property)*
177
178    - `Cdataset.keys`
179    - `Cdataset.iindex`
180    - `Cdataset.indexlen`
181    - `Cdataset.lenindex`
182    - `Cdataset.lname`
183    - `Cdataset.lunicname`
184    - `Cdataset.lunicrow`
185    - `Cdataset.tiindex`
186
187    *add - update methods (`observation.dataset_structure.DatasetStructure`)*
188
189    - `Cdataset.add`
190    - `Cdataset.delindex`
191    - `Cdataset.renameindex`
192    - `Cdataset.setname`
193
194    *structure management - methods (`observation.dataset_structure.DatasetStructure`)*
195
196    - `Cdataset.check_relation`
197    - `Cdataset.check_relationship`
198    - `Cdataset.nindex`
199    - `Cdataset.reindex`
200    - `Cdataset.reorder`
201    - `Cdataset.swapindex`
202    - `Cdataset.to_analysis`
203    '''
204    field_class = Cfield
205
206    def __init__(self, listidx=None, name=None, reindex=True):
207        '''
208        Dataset constructor.
209
210        *Parameters*
211
212        - **listidx** :  list (default None) - list of Field data
213        - **name** :  string (default None) - name of the dataset
214        - **reindex** : boolean (default True) - if True, default codec for each Field'''
215
216        if isinstance(listidx, Cdataset):
217            self.lindex = [copy(idx) for idx in listidx.lindex]
218            self.name = name if name else listidx.name
219            self._analysis = listidx._analysis
220            return
221        if listidx.__class__.__name__ == 'DataFrame':
222            lindex = NtvConnector.connector(
223            )['DataFrameConnec'].to_listidx(listidx)[0]
224            #listidx = [Cfield(field['codec'], field['name'], field['keys'])
225            listidx = [self.field_class(field['codec'], field['name'], field['keys'])
226                       for field in lindex]
227        self.name = name
228        self.lindex = [] if listidx is None else listidx
229        if reindex:
230            self.reindex()
231        self._analysis = None
232        return
233
234    def __repr__(self):
235        '''return classname, number of values and number of indexes'''
236        return self.__class__.__name__ + '[' + str(len(self)) + ', ' + str(self.lenindex) + ']'
237
238    def __str__(self):
239        '''return string format for var and lidx'''
240        stri = ''
241        stri += 'fields :\n'
242        for idx in self.lindex:
243            stri += '    ' + str(idx) + '\n'
244        return stri
245
246    def __len__(self):
247        ''' len of values'''
248        if not self.lindex:
249            return 0
250        return len(self.lindex[0])
251
252    def __contains__(self, item):
253        ''' check whether item is one of the lindex Fields'''
254        return item in self.lindex
255
256    def __getitem__(self, ind):
257        ''' return value record (value conversion)'''
258        res = [idx[ind] for idx in self.lindex]
259        if len(res) == 1:
260            return res[0]
261        return res
262
263    def __setitem__(self, ind, item):
264        ''' modify the Field values for each Field at the row ind'''
265        if not isinstance(item, list):
266            item = [item]
267        for val, idx in zip(item, self.lindex):
268            idx[ind] = val
269
270    def __delitem__(self, ind):
271        ''' remove all Field item at the row ind'''
272        for idx in self.lindex:
273            del idx[ind]
274
275    def __hash__(self):
276        '''return hash of all hash(Field)'''
277        #return hash(tuple(hash(idx) for idx in self.lindex))
278        return sum(hash(idx) for idx in self.lindex)
279
280    def __eq__(self, other):
281        ''' equal if hash values are equal'''
282        return hash(self) == hash(other)
283
284    def __copy__(self):
285        ''' Copy all the data '''
286        return self.__class__(self)
287
288# %% property
289    @property
290    def _hashd(self):
291        '''return hash of all hashf(Field)'''
292        # return sum([idx._hashi() for idx in self.lindex])
293        return hash(tuple(fld.hashf for fld in self.lindex))
294
295    @property
296    def indexlen(self):
297        ''' list of index codec length'''
298        return [len(idx.codec) for idx in self.lindex]
299
300    @property
301    def iindex(self):
302        ''' list of keys for each index'''
303        return [idx.keys for idx in self.lindex]
304
305    @property
306    def keys(self):
307        ''' list of keys for each index'''
308        return [idx.keys for idx in self.lindex]
309
310    @property
311    def lenindex(self):
312        ''' number of indexes'''
313        return len(self.lindex)
314
315    @property
316    def lunicname(self):
317        ''' list of unique index name'''
318        return [idx.name for idx in self.lindex if len(idx.codec) == 1]
319
320    @property
321    def lunicrow(self):
322        '''list of unique index rows'''
323        return [self.lname.index(name) for name in self.lunicname]
324
325    @property
326    def lname(self):
327        ''' list of index name'''
328        return [idx.name for idx in self.lindex]
329
330    @property
331    def tiindex(self):
332        ''' list of keys for each record'''
333        return Cutil.list(list(zip(*self.iindex)))
334
335# %%methods
336
337    @classmethod
338    def ntv(cls, ntv_value, reindex=True, fast=False):
339        '''Generate a Dataset Object from a ntv_value
340
341        *Parameters*
342
343        - **ntv_value** : bytes, string, Ntv object to convert
344        - **reindex** : boolean (default True) - if True, default codec for each Field
345        - **fast** : boolean (default False) - if True, ntv_value is not converted to json-value'''
346        return cls.from_ntv(ntv_value, reindex=reindex, fast=fast)
347
348    @classmethod
349    def from_ntv(cls, ntv_value, reindex=True, decode_str=False, fast=False):
350        '''Generate a Dataset Object from a ntv_value
351
352        *Parameters*
353
354        - **ntv_value** : bytes, string, Ntv object to convert
355        - **reindex** : boolean (default True) - if True, default codec for each Field
356        - **decode_str**: boolean (default False) - if True, strings are loaded as json data
357        - **fast** : boolean (default False) - if True, ntv_value is not converted to json-value'''
358        ntv = Ntv.obj(ntv_value, decode_str=decode_str, fast=fast)
359        if len(ntv) == 0:
360            return cls()
361        lidx = [list(NtvUtil.decode_ntv_tab(
362            ntvf, cls.field_class.ntv_to_val)) for ntvf in ntv]
363        leng = max(idx[6] for idx in lidx)
364        for ind in range(len(lidx)):
365            if lidx[ind][0] == '':
366                lidx[ind][0] = 'i'+str(ind)
367            NtvConnector.init_ntv_keys(ind, lidx, leng)
368        lindex = [cls.field_class(idx[2], idx[0], idx[4], None,  # idx[1] for the type,
369                                  reindex=reindex) for idx in lidx]
370        return cls(lindex, reindex=reindex, name=ntv.name)
371
372    def add(self, other, name=False, solve=True):
373        ''' Add other's values to self's values for each index
374
375        *Parameters*
376
377        - **other** : Dataset object to add to self object
378        - **name** : Boolean (default False) - Add values with same index name (True) or
379        same index row (False)
380        - **solve** : Boolean (default True) - If True, replace other's None codec values
381        with self codec values.
382
383        *Returns* : self '''
384        if self.lenindex != other.lenindex:
385            raise DatasetError('length are not identical')
386        if name and sorted(self.lname) != sorted(other.lname):
387            raise DatasetError('name are not identical')
388        for i in range(self.lenindex):
389            if name:
390                self.lindex[i].add(other.lindex[other.lname.index(self.lname[i])],
391                                   solve=solve)
392            else:
393                self.lindex[i].add(other.lindex[i], solve=solve)
394        return self
395
396    def to_analysis(self, distr=False):
397        '''return a dict with data used in AnaDataset module
398
399        *Parameters*
400
401        - **distr** : Boolean (default False) - If True, add distr information'''
402        return {'name': self.name, 'fields': [fld.to_analysis for fld in self.lindex],
403                'length': len(self), 'hashd': self._hashd,
404                'relations': {self.lindex[i].name:
405                              {self.lindex[j].name: Cutil.dist(
406                                  self.lindex[i].keys, self.lindex[j].keys, distr)
407                               for j in range(i+1, len(self.lindex))}
408                              for i in range(len(self.lindex)-1)}
409                }
410
411    def reindex(self):
412        '''Calculate a new default codec for each index (Return self)'''
413        for idx in self.lindex:
414            idx.reindex()
415        return self
416
417    def delindex(self, delname=None, savename=None):
418        '''remove a Field or a list of Fields.
419
420        *Parameters*
421
422        - **delname** : string or list of string - name of index to remove
423        - **savename** : string or list of string - name of index to keep
424
425        *Returns* : none '''
426        if not delname and not savename:
427            return
428        if isinstance(delname, str):
429            delname = [delname]
430        if isinstance(savename, str):
431            savename = [savename]
432        if delname and savename:
433            delname = [name for name in delname if not name in savename]
434        if not delname:
435            delname = [name for name in self.lname if not name in savename]
436        for idxname in delname:
437            if idxname in self.lname:
438                self.lindex.pop(self.lname.index(idxname))
439
440    def nindex(self, name):
441        ''' return the Field whose name equals the given name (None if absent)'''
442        if name in self.lname:
443            return self.lindex[self.lname.index(name)]
444        return None
445
446    def renameindex(self, oldname, newname):
447        '''replace an index name 'oldname' by a new one 'newname'. '''
448        for i in range(self.lenindex):
449            if self.lname[i] == oldname:
450                self.lindex[i].setname(newname)
451        for i in range(len(self.lvarname)):
452            if self.lvarname[i] == oldname:
453                self.lvarname[i] = newname
454
455    def reorder(self, recorder=None):
456        '''Reorder records in the order defined by 'recorder' '''
457        if recorder is None or set(recorder) != set(range(len(self))):
458            return None
459        for idx in self.lindex:
460            idx.set_keys([idx.keys[i] for i in recorder])
461        return None
462
463    def setname(self, listname=None):
464        '''Update Field names with the names in listname'''
465        for i in range(min(self.lenindex, len(listname))):
466            self.lindex[i].name = listname[i]
467
468    def swapindex(self, order):
469        '''
470        Change the order of the indexes.
471
472        *Parameters*
473
474        - **order** : list of int or list of str - new order of indexes to apply.
475
476        *Returns* : self '''
477        if self.lenindex != len(order):
478            raise DatasetError('length of order and Dataset different')
479        if not order or isinstance(order[0], int):
480            self.lindex = [self.lindex[ind] for ind in order]
481        elif isinstance(order[0], str):
482            self.lindex = [self.nindex(name) for name in order]
483        return self
484
485    def check_relation(self, parent, field, typecoupl, value=True):
486        '''get the inconsistent records for a relationship
487
488         *Parameters*
489
490        - **field** : int or str - index or name of the field involved in the relation
491        - **parent**: int or str - index or name of the second field involved in the relation
492        - **typecoupl**: str - relationship to check ('derived' or 'coupled')
493        - **value**: boolean (default True) - if True return a dict with inconsistent
494        values of the fields, else a tuple with indexes of records
495
496        *Returns* :
497
498        - dict with inconsistent values of the fields
499        - or a tuple with indexes of records'''
500        f_parent = copy(self.nindex(parent) if isinstance(parent, str)
501                                            else self.lindex[parent])
502        f_field = copy(self.nindex(field) if isinstance(field, str)
503                                          else self.lindex[field])
504        return Cfield.check_relation(f_parent, f_field, typecoupl, value)
505
506    def check_relationship(self, relations):
507        '''get the inconsistent records for each relationship defined in relations
508
509         *Parameters*
510
511        - **relations** : list of dict or single dict - list of fields with relationship property
512
513        *Returns* :
514
515        - dict with, for each relationship, a key (string with the two field names)
516        and a value (list of inconsistent records)
517        - or, for a single relationship, the value only'''
518        if not isinstance(relations, (list, dict)):
519            raise DatasetError("relations is not correct")
520        if isinstance(relations, dict):
521            relations = [relations]
522        dic_res = {}
523        for field in relations:
524            if not 'relationship' in field or not 'name' in field:
525                continue
526            if not 'parent' in field['relationship'] or not 'link' in field['relationship']:
527                raise DatasetError("relationship is not correct")
528            rel = field['relationship']['link']
529            f_parent = field['relationship']['parent']
530            f_field = field['name']
531            name_rel = f_field + ' - ' + f_parent
532            if self.nindex(f_parent) is None or self.nindex(f_field) is None:
533                raise DatasetError("field's name is not present in data")
534            dic_res[name_rel] = self.check_relation(f_parent, f_field, rel, False)
535        if len(dic_res) == 1:
536            return list(dic_res.values())[0]
537        return dic_res
538
539
540class DatasetError(Exception):
541    # %% errors
542    ''' Dataset Exception'''
543    # pass
class DatasetAnalysis:

This class is the interface between Cdataset and the tab_analysis module.

analysis

The analysis attribute is associated to the AnaDataset object

anafields

list of AnaField

partitions

list of partitions defined with index representation (AnaDataset method)

complete

complete property of the dataset (AnaDataset method)

dimension

dimension of the dataset (AnaDataset method)

lvarname

list of variable Field name (AnaDataset method)

primaryname

list of primary name (AnaDataset method)

secondaryname

list of secondary name (AnaDataset method)
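
A hedged sketch of how these properties might be read (ds is the illustrative dataset from the quick-start sketch near the top of the page; the printed values depend on the data and on the tab_analysis module):

# hedged sketch - the analysis properties are plain attributes of the dataset
print(ds.dimension, ds.complete)      # e.g. 2 True for a fully crossed year x city dataset
print(ds.primaryname, ds.lvarname)    # e.g. ['year', 'city'] ['temperature']
print(ds.partitions)                  # available partitions, in 'index' representation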

def indexinfos(self, keys=None):

return a dict with infos of each index (AnaDataset method) :

  • num, name, cat, diffdistparent, child, parent, distparent, crossed, pparent, rateder (struct info)
  • lencodec, mincodec, maxcodec, typecodec, ratecodec (base info)

Parameters

  • keys : string, list or tuple (default None) - list of attributes to be returned. If 'all' or None, all attributes are returned. If 'struct', only structural attributes are returned.

Returns : dict
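
A hedged usage sketch (ds as in the quick-start sketch near the top of the page; the exact attribute names in the returned dict come from the tab_analysis module):

# hedged sketch - keep only the structural attributes of each field
infos = ds.indexinfos(keys='struct')
print(infos)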

def field_partition(self, partition=None, mode='index'):

return a partition dict with the list of primary, secondary, unique and variable fields (index).

Parameters

  • partition : list (default None) - if None, partition is the first
  • mode : str (default 'index') - Field representation ('id', 'index')
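
A hedged usage sketch (ds as in the quick-start sketch near the top of the page):

# hedged sketch - default (first) partition, fields identified by name
part = ds.field_partition(mode='id')
print(part)   # per the description above: primary, secondary, unique and variable fields
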
def relation(self, fld1, fld2):

relationship between two fields (AnaDataset method)

def tree(self, mode='derived', width=5, lname=20, string=True):

return a string with a tree of derived Field (AnaDataset method).

Parameters

  • lname : integer (default 20) - length of the names
  • width : integer (default 5) - length of the lines
  • string : boolean (default True) - if True return str else return dict
  • mode : string (default 'derived') - kind of tree: 'derived' (derived tree), 'distance' (min distance tree), 'distomin' (min distomin tree)
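
A hedged usage sketch (ds as in the quick-start sketch near the top of the page):

# hedged sketch - derived-field tree with field names truncated to 10 characters
print(ds.tree(mode='derived', lname=10))
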
def indicator(self, fullsize=None, size=None):

generate size indicators: ol (object lightness), ul (unicity level), gain (sizegain)

Parameters

  • fullsize : int (default None) - size with full codec
  • size : int (default None) - size with existing codec

Returns : dict
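
A hedged usage sketch. As the source listing above shows, the default sizes are computed with to_obj, which is provided by subclasses; passing explicit sizes keeps the call self-contained (the two integers below are illustrative):

# hedged sketch - explicit sizes, e.g. measured JSON lengths
print(ds.indicator(fullsize=1500, size=800))   # ol, ul and gain indicators, per the docstring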

class Cdataset(DatasetAnalysis):

A Cdataset is a representation of tabular data.

Attributes (for @property see methods) :

  • lindex : list of Field
  • name : name of the Cdataset
  • _analysis : AnaDataset object

The methods defined in this class are :

constructor (@classmethod)

dynamic value - module analysis (getters @property)

selecting - infos methods (module analysis)

dynamic value (getters @property)

add - update methods (observation.dataset_structure.DatasetStructure)

structure management - methods (observation.dataset_structure.DatasetStructure)

Cdataset(listidx=None, name=None, reindex=True)

Dataset constructor.

Parameters

  • listidx : list (default None) - list of Field data
  • name : string (default None) - name of the dataset
  • reindex : boolean (default True) - if True, default codec for each Field
indexlen

list of index codec length

iindex

list of keys for each index

keys

list of keys for each index

lenindex

number of indexes

lunicname

list of unique index name

lunicrow

list of unique index rows

lname

list of index name

tiindex

list of keys for each record

@classmethod
def ntv(cls, ntv_value, reindex=True, fast=False):

Generate a Dataset Object from a ntv_value

Parameters

  • ntv_value : bytes, string, Ntv object to convert
  • reindex : boolean (default True) - if True, default codec for each Field
  • fast : boolean (default False) - if True, ntv_value is not converted to json-value
@classmethod
def from_ntv(cls, ntv_value, reindex=True, decode_str=False, fast=False):

Generate a Dataset Object from a ntv_value

Parameters

  • ntv_value : bytes, string, Ntv object to convert
  • reindex : boolean (default True) - if True, default codec for each Field
  • decode_str : boolean (default False) - if True, strings are loaded as json data
  • fast : boolean (default False) - if True, ntv_value is not converted to json-value
def add(self, other, name=False, solve=True):

Add other's values to self's values for each index

Parameters

  • other : Dataset object to add to self object
  • name : Boolean (default False) - Add values with same index name (True) or same index row (False)
  • solve : Boolean (default True) - If True, replace other's None codec values with self codec values.

Returns : self
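
A hedged usage sketch (ds as in the quick-start sketch near the top of the page; other_ds is a hypothetical second Cdataset with the same field names):

# hedged sketch - append other_ds records, matching fields by name rather than by position
ds.add(other_ds, name=True)
print(len(ds))   # record count of both datasets combined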

def to_analysis(self, distr=False):

return a dict with data used in AnaDataset module

Parameters

  • distr : Boolean (default False) - If True, add distr information
def reindex(self):

Calculate a new default codec for each index (Return self)

def delindex(self, delname=None, savename=None):

remove a Field or a list of Fields.

Parameters

  • delname : string or list of string - name of index to remove
  • savename : string or list of string - name of index to keep

Returns : none
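
A hedged usage sketch (ds as in the quick-start sketch near the top of the page):

# hedged sketch - keep only 'year' and 'city', every other field is removed
ds.delindex(savename=['year', 'city'])
print(ds.lname)   # expected: ['year', 'city']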

def nindex(self, name):

return the Field whose name equals the given name (None if absent)

def renameindex(self, oldname, newname):

replace an index name 'oldname' by a new one 'newname'.

def reorder(self, recorder=None):

Reorder records in the order defined by 'recorder'

def setname(self, listname=None):

Update Field names with the names in listname

def swapindex(self, order):

Change the order of the indexes.

Parameters

  • order : list of int or list of str - new order of indexes to apply.

Returns : self
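
A hedged usage sketch (ds as in the quick-start sketch near the top of the page):

# hedged sketch - reorder the fields by name (a list of integer positions also works)
ds.swapindex(['city', 'year', 'temperature'])
print(ds.lname)   # expected: ['city', 'year', 'temperature']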

def check_relation(self, parent, field, typecoupl, value=True):

get the inconsistent records for a relationship

Parameters

  • field : int or str - index or name of the field involved in the relation
  • parent: int or str - index or name of the second field involved in the relation
  • typecoupl: str - relationship to check ('derived' or 'coupled')
  • value : boolean (default True) - if True return a dict with inconsistent values of the fields, else a tuple with indexes of records

Returns :

  • dict with inconsistent values of the fields
  • or a tuple with indexes of records
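
A hedged usage sketch (ds as in the quick-start sketch near the top of the page; here the relationship between 'year' (parent) and 'city' is tested):

# hedged sketch - return row indexes of inconsistent records instead of their values
faults = ds.check_relation('year', 'city', 'derived', value=False)
print(faults)   # records breaking the 'derived' relationship (empty if it holds)
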
def check_relationship(self, relations):

get the inconsistent records for each relationship defined in relations

Parameters

  • relations : list of dict or single dict - list of fields with relationship property

Returns :

  • dict with, for each relationship, a key (string with the two field names) and a value (list of inconsistent records)
  • or, for a single relationship, the value only
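
A hedged usage sketch (ds as in the quick-start sketch near the top of the page); the dict layout follows the keys tested in the source: 'name', 'relationship', 'parent' and 'link':

# hedged sketch - a single relationship, so the inconsistent-record value is returned directly
rel = {'name': 'city',
       'relationship': {'parent': 'year', 'link': 'derived'}}
print(ds.check_relationship(rel))
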
class DatasetError(builtins.Exception):

Dataset Exception
