tab-dataset.tab_dataset.cdataset

The cdataset module is part of the tab-dataset package.

It contains the classes DatasetAnalysis, Cdataset for Dataset entities.

For more information, see the user guide or the github repository.

  1# -*- coding: utf-8 -*-
  2"""
  3The `cdataset` module is part of the `tab-dataset` package.
  4
  5It contains the classes `DatasetAnalysis`, `Cdataset` for Dataset entities.
  6
  7For more information, see the 
  8[user guide](https://loco-philippe.github.io/tab-dataset/docs/user_guide.html) 
  9or the [github repository](https://github.com/loco-philippe/tab-dataset).
 10"""
 11from copy import copy
 12
 13from tab_dataset.cfield import Cfield, Cutil
 14
 15from json_ntv import Ntv
 16from json_ntv.ntv_util import NtvUtil, NtvConnector
 17
 18from tab_analysis import AnaDataset, Util
 19
 20
 21class DatasetAnalysis:
 22    '''This class is the Cdataset interface class with the tab_analysis module.'''
 23
 24# %% property
 25    @property
 26    def analysis(self):
 27        '''The analysis attribute is associated to the AnaDataset object'''
 28        if self._analysis is None or self._analysis.hashd != self._hashd:
 29            self._analysis = AnaDataset(self.to_analysis(True))
 30        return self._analysis
 31
 32    @property
 33    def anafields(self):
 34        ''' list of AnaField'''
 35        return self.analysis.fields
 36
 37    @property
 38    def partitions(self):
 39        ''' list of partitions defined with index representation (AnaDataset method)'''
 40        return self.analysis.partitions('index')
 41
 42    @property
 43    def complete(self):
 44        ''' complete property of the dataset (AnaDataset method)'''
 45        return self.analysis.complete
 46
 47    @property
 48    def dimension(self):
 49        ''' dimension of the dataset (AnaDataset method)'''
 50        return self.analysis.dimension
 51
 52    @property
 53    def lvarname(self):
 54        ''' list of variable Field name (AnaDataset method)'''
 55        return Util.view(self.analysis.variable, mode='id')
 56
 57    @property
 58    def primaryname(self):
 59        ''' list of primary name (AnaDataset method)'''
 60        return Util.view(self.analysis.primary, mode='id')
 61
 62    @property
 63    def secondaryname(self):
 64        ''' list of secondary name (AnaDataset method)'''
 65        return Util.view(self.analysis.secondary, mode='id')
 66
 67
 68# %% methods
 69
 70    def indexinfos(self, keys=None):
 71        '''return a dict with infos of each index (AnaDataset method) :
 72            
 73        - num, name, cat, diffdistparent, child, parent, distparent,
 74        crossed, pparent, rateder (struct info)
 75        - lencodec, mincodec, maxcodec, typecodec, ratecodec (base info)
 76
 77        *Parameters*
 78
 79        - **keys** : string, list or tuple (default None) - list of attributes
 80        to returned.
 81        if 'all' or None, all attributes are returned.
 82        if 'struct', only structural attributes are returned.
 83
 84        *Returns* : dict'''
 85        return self.analysis.to_dict(mode='index', keys=keys)
 86
 87    def field_partition(self, partition=None, mode='index'):
 88        '''return a partition dict with the list of primary, secondary, unique
 89        and variable fields (index).
 90
 91         *Parameters*
 92
 93        - **partition** : list (default None) - if None, partition is the first
 94        - **mode** : str (default 'index') - Field representation ('id', 'index')
 95        '''
 96        if not partition and len(self.partitions) > 0:
 97            partition = self.partitions[0]
 98        part = [self.analysis.dfield(fld)
 99                for fld in partition] if partition else None
100        return self.analysis.field_partition(mode=mode, partition=part,
101                                             distributed=True)
102
103    def relation(self, fld1, fld2):
104        '''relationship between two fields (AnaDataset method)'''
105        return self.analysis.get_relation(fld1, fld2)
106
107    def tree(self, mode='derived', width=5, lname=20, string=True):
108        '''return a string with a tree of derived Field (AnaDataset method).
109
110         *Parameters*
111
112        - **lname** : integer (default 20) - length of the names
113        - **width** : integer (default 5) - length of the lines
114        - **string** : boolean (default True) - if True return str else return dict
115        - **mode** : string (default 'derived') - kind of tree :
116            'derived' : derived tree
117            'distance': min distance tree
118            'distomin': min distomin tree
119        '''
120        return self.analysis.tree(mode=mode, width=width, lname=lname, string=string)
121
122    def indicator(self, fullsize=None, size=None):
123        '''generate size indicators: ol (object lightness), ul (unicity level),
124        gain (sizegain)
125
126        *Parameters*
127
128        - **fullsize** : int (default none) - size with full codec
129        - **size** : int (default none) - size with existing codec
130
131        *Returns* : dict'''
132        if not fullsize:
133            fullsize = len(self.to_obj(encoded=True, modecodec='full'))
134        if not size:
135            size = len(self.to_obj(encoded=True))
136        return self.analysis.indicator(fullsize, size)
137
138
139class Cdataset(DatasetAnalysis):
140    # %% magic
141    '''
142    A `Cdataset` is a representation of a tabular data.
143
144    *Attributes (for @property see methods)* :
145
146    - **lindex** : list of Field
147    - **name** : name of the Cdataset
148    - **_analysis** : AnaDataset object
149
150    The methods defined in this class are :
151
152    *constructor (@classmethod)*
153
154    - `Cdataset.ntv`
155    - `Cdataset.from_ntv`
156
157    *dynamic value - module analysis (getters @property)*
158
159    - `DatasetAnalysis.analysis`
160    - `DatasetAnalysis.anafields`
161    - `DatasetAnalysis.lvarname`
162    - `DatasetAnalysis.partitions`
163    - `DatasetAnalysis.primaryname`
164    - `DatasetAnalysis.secondaryname`
165    - `DatasetAnalysis.complete`
166    - `DatasetAnalysis.dimension`
167
168    *selecting - infos methods (module analysis)*
169
170    - `DatasetAnalysis.field_partition`
171    - `DatasetAnalysis.indexinfos`
172    - `DatasetAnalysis.indicator`
173    - `DatasetAnalysis.relation`
174    - `DatasetAnalysis.tree`
175
176    *dynamic value (getters @property)*
177
178    - `Cdataset.keys`
179    - `Cdataset.iindex`
180    - `Cdataset.indexlen`
181    - `Cdataset.lenindex`
182    - `Cdataset.lname`
183    - `Cdataset.lunicname`
184    - `Cdataset.lunicrow`
185    - `Cdataset.tiindex`
186
187    *add - update methods (`observation.dataset_structure.DatasetStructure`)*
188
189    - `Cdataset.add`
190    - `Cdataset.delindex`
191    - `Cdataset.renameindex`
192    - `Cdataset.setname`
193
194    *structure management - methods (`observation.dataset_structure.DatasetStructure`)*
195
196    - `Cdataset.check_relation`
197    - `Cdataset.check_relationship`
198    - `Cdataset.nindex`
199    - `Cdataset.reindex`
200    - `Cdataset.reorder`
201    - `Cdataset.swapindex`
202    - `Cdataset.to_analysis`
203    '''
204    field_class = Cfield
205
206    def __init__(self, listidx=None, name=None, reindex=True):
207        '''
208        Dataset constructor.
209
210        *Parameters*
211
212        - **listidx** :  list (default None) - list of Field data
213        - **name** :  string (default None) - name of the dataset
214        - **reindex** : boolean (default True) - if True, default codec for each Field'''
215
216        if isinstance(listidx, Cdataset):
217            self.lindex = [copy(idx) for idx in listidx.lindex]
218            self.name = name if name else listidx.name
219            self._analysis = listidx._analysis
220            return
221        if listidx.__class__.__name__ == 'DataFrame':
222            lindex = NtvConnector.connector(
223            )['DataFrameConnec'].to_listidx(listidx)[0]
224            listidx = [Cfield(field['codec'], field['name'], field['keys'])
225                       for field in lindex]
226        self.name = name
227        self.lindex = [] if listidx is None else listidx
228        if reindex:
229            self.reindex()
230        self._analysis = None
231        return
232
233    def __repr__(self):
234        '''return classname, number of value and number of indexes'''
235        return self.__class__.__name__ + '[' + str(len(self)) + ', ' + str(self.lenindex) + ']'
236
237    def __str__(self):
238        '''return string format for var and lidx'''
239        stri = ''
240        stri += 'fields :\n'
241        for idx in self.lindex:
242            stri += '    ' + str(idx) + '\n'
243        return stri
244
245    def __len__(self):
246        ''' len of values'''
247        if not self.lindex:
248            return 0
249        return len(self.lindex[0])
250
251    def __contains__(self, item):
252        ''' list of lindex values'''
253        return item in self.lindex
254
255    def __getitem__(self, ind):
256        ''' return value record (value conversion)'''
257        res = [idx[ind] for idx in self.lindex]
258        if len(res) == 1:
259            return res[0]
260        return res
261
262    def __setitem__(self, ind, item):
263        ''' modify the Field values for each Field at the row ind'''
264        if not isinstance(item, list):
265            item = [item]
266        for val, idx in zip(item, self.lindex):
267            idx[ind] = val
268
269    def __delitem__(self, ind):
270        ''' remove all Field item at the row ind'''
271        for idx in self.lindex:
272            del idx[ind]
273
274    def __hash__(self):
275        '''return hash of all hash(Field)'''
276        #return hash(tuple(hash(idx) for idx in self.lindex))
277        return sum(hash(idx) for idx in self.lindex)
278
279    def __eq__(self, other):
280        ''' equal if hash values are equal'''
281        return hash(self) == hash(other)
282
283    def __copy__(self):
284        ''' Copy all the data '''
285        return self.__class__(self)
286
287# %% property
288    @property
289    def _hashd(self):
290        '''return hash of all hashf(Field)'''
291        # return sum([idx._hashi() for idx in self.lindex])
292        return hash(tuple(fld.hashf for fld in self.lindex))
293
294    @property
295    def indexlen(self):
296        ''' list of index codec length'''
297        return [len(idx.codec) for idx in self.lindex]
298
299    @property
300    def iindex(self):
301        ''' list of keys for each index'''
302        return [idx.keys for idx in self.lindex]
303
304    @property
305    def keys(self):
306        ''' list of keys for each index'''
307        return [idx.keys for idx in self.lindex]
308
309    @property
310    def lenindex(self):
311        ''' number of indexes'''
312        return len(self.lindex)
313
314    @property
315    def lunicname(self):
316        ''' list of unique index name'''
317        return [idx.name for idx in self.lindex if len(idx.codec) == 1]
318
319    @property
320    def lunicrow(self):
321        '''list of unic idx row'''
322        return [self.lname.index(name) for name in self.lunicname]
323
324    @property
325    def lname(self):
326        ''' list of index name'''
327        return [idx.name for idx in self.lindex]
328
329    @property
330    def tiindex(self):
331        ''' list of keys for each record'''
332        return Cutil.list(list(zip(*self.iindex)))
333
334# %%methods
335
336    @classmethod
337    def ntv(cls, ntv_value, reindex=True, fast=False):
338        '''Generate an Dataset Object from a ntv_value
339
340        *Parameters*
341
342        - **ntv_value** : bytes, string, Ntv object to convert
343        - **reindex** : boolean (default True) - if True, default codec for each Field
344        - **fast** : boolean (default False) - if True, ntv_value are not converted in json-value'''
345        return cls.from_ntv(ntv_value, reindex=reindex, fast=fast)
346
347    @classmethod
348    def from_ntv(cls, ntv_value, reindex=True, decode_str=False, fast=False):
349        '''Generate a Dataset Object from a ntv_value
350
351        *Parameters*
352
353        - **ntv_value** : bytes, string, Ntv object to convert
354        - **reindex** : boolean (default True) - if True, default codec for each Field
355        - **decode_str**: boolean (default False) - if True, string are loaded in json data
356        - **fast** : boolean (default False) - if True, ntv_value are not converted in json-value'''
357        ntv = Ntv.obj(ntv_value, decode_str=decode_str, fast=fast)
358        if len(ntv) == 0:
359            return cls()
360        lidx = [list(NtvUtil.decode_ntv_tab(
361            ntvf, cls.field_class.ntv_to_val)) for ntvf in ntv]
362        leng = max(idx[6] for idx in lidx)
363        for ind in range(len(lidx)):
364            if lidx[ind][0] == '':
365                lidx[ind][0] = 'i'+str(ind)
366            NtvConnector.init_ntv_keys(ind, lidx, leng)
367        lindex = [cls.field_class(idx[2], idx[0], idx[4], None,  # idx[1] pour le type,
368                                  reindex=reindex) for idx in lidx]
369        return cls(lindex, reindex=reindex, name=ntv.name)
370
371    def add(self, other, name=False, solve=True):
372        ''' Add other's values to self's values for each index
373
374        *Parameters*
375
376        - **other** : Dataset object to add to self object
377        - **name** : Boolean (default False) - Add values with same index name (True) or
378        same index row (False)
379        - **solve** : Boolean (default True) - If True, replace None other's codec value
380        with self codec value.
381
382        *Returns* : self '''
383        if self.lenindex != other.lenindex:
384            raise DatasetError('length are not identical')
385        if name and sorted(self.lname) != sorted(other.lname):
386            raise DatasetError('name are not identical')
387        for i in range(self.lenindex):
388            if name:
389                self.lindex[i].add(other.lindex[other.lname.index(self.lname[i])],
390                                   solve=solve)
391            else:
392                self.lindex[i].add(other.lindex[i], solve=solve)
393        return self
394
395    def to_analysis(self, distr=False):
396        '''return a dict with data used in AnaDataset module
397
398        *Parameters*
399
400        - **distr** : Boolean (default False) - If True, add distr information'''
401        return {'name': self.name, 'fields': [fld.to_analysis for fld in self.lindex],
402                'length': len(self), 'hashd': self._hashd,
403                'relations': {self.lindex[i].name:
404                              {self.lindex[j].name: Cutil.dist(
405                                  self.lindex[i].keys, self.lindex[j].keys, distr)
406                               for j in range(i+1, len(self.lindex))}
407                              for i in range(len(self.lindex)-1)}
408                }
409
410    def reindex(self):
411        '''Calculate a new default codec for each index (Return self)'''
412        for idx in self.lindex:
413            idx.reindex()
414        return self
415
416    def delindex(self, delname=None, savename=None):
417        '''remove an Field or a list of Field.
418
419        *Parameters*
420
421        - **delname** : string or list of string - name of index to remove
422        - **savename** : string or list of string - name of index to keep
423
424        *Returns* : none '''
425        if not delname and not savename:
426            return
427        if isinstance(delname, str):
428            delname = [delname]
429        if isinstance(savename, str):
430            savename = [savename]
431        if delname and savename:
432            delname = [name for name in delname if not name in savename]
433        if not delname:
434            delname = [name for name in self.lname if not name in savename]
435        for idxname in delname:
436            if idxname in self.lname:
437                self.lindex.pop(self.lname.index(idxname))
438
439    def nindex(self, name):
440        ''' index with name equal to attribute name'''
441        if name in self.lname:
442            return self.lindex[self.lname.index(name)]
443        return None
444
445    def renameindex(self, oldname, newname):
446        '''replace an index name 'oldname' by a new one 'newname'. '''
447        for i in range(self.lenindex):
448            if self.lname[i] == oldname:
449                self.lindex[i].setname(newname)
450        for i in range(len(self.lvarname)):
451            if self.lvarname[i] == oldname:
452                self.lvarname[i] = newname
453
454    def reorder(self, recorder=None):
455        '''Reorder records in the order define by 'recorder' '''
456        if recorder is None or set(recorder) != set(range(len(self))):
457            return None
458        for idx in self.lindex:
459            idx.set_keys([idx.keys[i] for i in recorder])
460        return None
461
462    def setname(self, listname=None):
463        '''Update Field name by the name in listname'''
464        for i in range(min(self.lenindex, len(listname))):
465            self.lindex[i].name = listname[i]
466
467    def swapindex(self, order):
468        '''
469        Change the order of the index .
470
471        *Parameters*
472
473        - **order** : list of int or list of name - new order of index to apply.
474
475        *Returns* : self '''
476        if self.lenindex != len(order):
477            raise DatasetError('length of order and Dataset different')
478        if not order or isinstance(order[0], int):
479            self.lindex = [self.lindex[ind] for ind in order]
480        elif isinstance(order[0], str):
481            self.lindex = [self.nindex(name) for name in order]
482        return self
483
484    def check_relation(self, field, parent, typecoupl, value=True):
485        '''get the inconsistent records for a relationship
486
487         *Parameters*
488
489        - **field** : int or str - index or name of the field involved in the relation
490        - **parent**: int or str - index or name of the second field involved in the relation
491        - **typecoupl**: str - relationship to check ('derived' or 'coupled')
492        - **value**: boolean (default True) - if True return a dict with inconsistent
493        values of the fields, else a tuple with index of records)
494
495        *Returns* :
496
497        - dict with inconsistent values of the fields
498        - or a tuple with index of records'''
499        f_parent = copy(self.nindex(parent) if isinstance(parent, str)
500                                            else self.lindex[parent])
501        f_field = copy(self.nindex(field) if isinstance(field, str)
502                                          else self.lindex[field])
503        match typecoupl:
504            case 'derived':
505                errors = f_parent.coupling(f_field, reindex=True)
506            case 'coupled':
507                errors = copy(f_parent).coupling(
508                    f_field, derived=False, reindex=True)
509            case _:
510                raise DatasetError(typecoupl + "is not a valid relationship")
511        if not value:
512            return errors
513        return {'row': list(errors), f_field.name: f_field[errors], f_parent.name: f_parent[errors]}
514
515    def check_relationship(self, relations):
516        '''get the inconsistent records for each relationship defined in relations
517
518         *Parameters*
519
520        - **relations** : list of dict or single dict - list of fields with relationship property
521
522        *Returns* :
523
524        - dict with for each relationship: key = string with the two fields name,
525        and value = list of inconsistent records
526        - or if single relationship : value'''
527        if not isinstance(relations, (list, dict)):
528            raise DatasetError("relations is not correct")
529        if isinstance(relations, dict):
530            relations = [relations]
531        dic_res = {}
532        for field in relations:
533            if not 'relationship' in field or not 'name' in field:
534                continue
535            if not 'parent' in field['relationship'] or not 'link' in field['relationship']:
536                raise DatasetError("relationship is not correct")
537            rel = field['relationship']['link']
538            f_parent = field['relationship']['parent']
539            f_field = field['name']
540            name_rel = f_field + ' - ' + f_parent
541            if self.nindex(f_parent) is None or self.nindex(f_field) is None:
542                raise DatasetError("field's name is not present in data")
543            dic_res[name_rel] = self.check_relation(
544                f_field, f_parent, rel, False)
545        if len(dic_res) == 1:
546            return list(dic_res.values())[0]
547        return dic_res
548
549
550class DatasetError(Exception):
551    # %% errors
552    ''' Dataset Exception'''
553    # pass
class DatasetAnalysis:
 22class DatasetAnalysis:
 23    '''This class is the Cdataset interface class with the tab_analysis module.'''
 24
 25# %% property
 26    @property
 27    def analysis(self):
 28        '''The analysis attribute is associated to the AnaDataset object'''
 29        if self._analysis is None or self._analysis.hashd != self._hashd:
 30            self._analysis = AnaDataset(self.to_analysis(True))
 31        return self._analysis
 32
 33    @property
 34    def anafields(self):
 35        ''' list of AnaField'''
 36        return self.analysis.fields
 37
 38    @property
 39    def partitions(self):
 40        ''' list of partitions defined with index representation (AnaDataset method)'''
 41        return self.analysis.partitions('index')
 42
 43    @property
 44    def complete(self):
 45        ''' complete property of the dataset (AnaDataset method)'''
 46        return self.analysis.complete
 47
 48    @property
 49    def dimension(self):
 50        ''' dimension of the dataset (AnaDataset method)'''
 51        return self.analysis.dimension
 52
 53    @property
 54    def lvarname(self):
 55        ''' list of variable Field name (AnaDataset method)'''
 56        return Util.view(self.analysis.variable, mode='id')
 57
 58    @property
 59    def primaryname(self):
 60        ''' list of primary name (AnaDataset method)'''
 61        return Util.view(self.analysis.primary, mode='id')
 62
 63    @property
 64    def secondaryname(self):
 65        ''' list of secondary name (AnaDataset method)'''
 66        return Util.view(self.analysis.secondary, mode='id')
 67
 68
 69# %% methods
 70
 71    def indexinfos(self, keys=None):
 72        '''return a dict with infos of each index (AnaDataset method) :
 73            
 74        - num, name, cat, diffdistparent, child, parent, distparent,
 75        crossed, pparent, rateder (struct info)
 76        - lencodec, mincodec, maxcodec, typecodec, ratecodec (base info)
 77
 78        *Parameters*
 79
 80        - **keys** : string, list or tuple (default None) - list of attributes
 81        to returned.
 82        if 'all' or None, all attributes are returned.
 83        if 'struct', only structural attributes are returned.
 84
 85        *Returns* : dict'''
 86        return self.analysis.to_dict(mode='index', keys=keys)
 87
 88    def field_partition(self, partition=None, mode='index'):
 89        '''return a partition dict with the list of primary, secondary, unique
 90        and variable fields (index).
 91
 92         *Parameters*
 93
 94        - **partition** : list (default None) - if None, partition is the first
 95        - **mode** : str (default 'index') - Field representation ('id', 'index')
 96        '''
 97        if not partition and len(self.partitions) > 0:
 98            partition = self.partitions[0]
 99        part = [self.analysis.dfield(fld)
100                for fld in partition] if partition else None
101        return self.analysis.field_partition(mode=mode, partition=part,
102                                             distributed=True)
103
104    def relation(self, fld1, fld2):
105        '''relationship between two fields (AnaDataset method)'''
106        return self.analysis.get_relation(fld1, fld2)
107
108    def tree(self, mode='derived', width=5, lname=20, string=True):
109        '''return a string with a tree of derived Field (AnaDataset method).
110
111         *Parameters*
112
113        - **lname** : integer (default 20) - length of the names
114        - **width** : integer (default 5) - length of the lines
115        - **string** : boolean (default True) - if True return str else return dict
116        - **mode** : string (default 'derived') - kind of tree :
117            'derived' : derived tree
118            'distance': min distance tree
119            'distomin': min distomin tree
120        '''
121        return self.analysis.tree(mode=mode, width=width, lname=lname, string=string)
122
123    def indicator(self, fullsize=None, size=None):
124        '''generate size indicators: ol (object lightness), ul (unicity level),
125        gain (sizegain)
126
127        *Parameters*
128
129        - **fullsize** : int (default none) - size with full codec
130        - **size** : int (default none) - size with existing codec
131
132        *Returns* : dict'''
133        if not fullsize:
134            fullsize = len(self.to_obj(encoded=True, modecodec='full'))
135        if not size:
136            size = len(self.to_obj(encoded=True))
137        return self.analysis.indicator(fullsize, size)

This class is the Cdataset interface class with the tab_analysis module.

analysis

The analysis attribute is associated to the AnaDataset object

anafields

list of AnaField

partitions

list of partitions defined with index representation (AnaDataset method)

complete

complete property of the dataset (AnaDataset method)

dimension

dimension of the dataset (AnaDataset method)

lvarname

list of variable Field name (AnaDataset method)

primaryname

list of primary name (AnaDataset method)

secondaryname

list of secondary name (AnaDataset method)

def indexinfos(self, keys=None):
71    def indexinfos(self, keys=None):
72        '''return a dict with infos of each index (AnaDataset method) :
73            
74        - num, name, cat, diffdistparent, child, parent, distparent,
75        crossed, pparent, rateder (struct info)
76        - lencodec, mincodec, maxcodec, typecodec, ratecodec (base info)
77
78        *Parameters*
79
80        - **keys** : string, list or tuple (default None) - list of attributes
81        to returned.
82        if 'all' or None, all attributes are returned.
83        if 'struct', only structural attributes are returned.
84
85        *Returns* : dict'''
86        return self.analysis.to_dict(mode='index', keys=keys)

return a dict with infos of each index (AnaDataset method) :

  • num, name, cat, diffdistparent, child, parent, distparent, crossed, pparent, rateder (struct info)
  • lencodec, mincodec, maxcodec, typecodec, ratecodec (base info)

Parameters

  • keys : string, list or tuple (default None) - list of attributes to returned. if 'all' or None, all attributes are returned. if 'struct', only structural attributes are returned.

Returns : dict

def field_partition(self, partition=None, mode='index'):
 88    def field_partition(self, partition=None, mode='index'):
 89        '''return a partition dict with the list of primary, secondary, unique
 90        and variable fields (index).
 91
 92         *Parameters*
 93
 94        - **partition** : list (default None) - if None, partition is the first
 95        - **mode** : str (default 'index') - Field representation ('id', 'index')
 96        '''
 97        if not partition and len(self.partitions) > 0:
 98            partition = self.partitions[0]
 99        part = [self.analysis.dfield(fld)
100                for fld in partition] if partition else None
101        return self.analysis.field_partition(mode=mode, partition=part,
102                                             distributed=True)

return a partition dict with the list of primary, secondary, unique and variable fields (index).

Parameters

  • partition : list (default None) - if None, partition is the first
  • mode : str (default 'index') - Field representation ('id', 'index')
def relation(self, fld1, fld2):
104    def relation(self, fld1, fld2):
105        '''relationship between two fields (AnaDataset method)'''
106        return self.analysis.get_relation(fld1, fld2)

relationship between two fields (AnaDataset method)

def tree(self, mode='derived', width=5, lname=20, string=True):
108    def tree(self, mode='derived', width=5, lname=20, string=True):
109        '''return a string with a tree of derived Field (AnaDataset method).
110
111         *Parameters*
112
113        - **lname** : integer (default 20) - length of the names
114        - **width** : integer (default 5) - length of the lines
115        - **string** : boolean (default True) - if True return str else return dict
116        - **mode** : string (default 'derived') - kind of tree :
117            'derived' : derived tree
118            'distance': min distance tree
119            'distomin': min distomin tree
120        '''
121        return self.analysis.tree(mode=mode, width=width, lname=lname, string=string)

return a string with a tree of derived Field (AnaDataset method).

Parameters

  • lname : integer (default 20) - length of the names
  • width : integer (default 5) - length of the lines
  • string : boolean (default True) - if True return str else return dict
  • mode : string (default 'derived') - kind of tree : 'derived' : derived tree 'distance': min distance tree 'distomin': min distomin tree
def indicator(self, fullsize=None, size=None):
123    def indicator(self, fullsize=None, size=None):
124        '''generate size indicators: ol (object lightness), ul (unicity level),
125        gain (sizegain)
126
127        *Parameters*
128
129        - **fullsize** : int (default none) - size with full codec
130        - **size** : int (default none) - size with existing codec
131
132        *Returns* : dict'''
133        if not fullsize:
134            fullsize = len(self.to_obj(encoded=True, modecodec='full'))
135        if not size:
136            size = len(self.to_obj(encoded=True))
137        return self.analysis.indicator(fullsize, size)

generate size indicators: ol (object lightness), ul (unicity level), gain (sizegain)

Parameters

  • fullsize : int (default none) - size with full codec
  • size : int (default none) - size with existing codec

Returns : dict

class Cdataset(DatasetAnalysis):
140class Cdataset(DatasetAnalysis):
141    # %% magic
142    '''
143    A `Cdataset` is a representation of a tabular data.
144
145    *Attributes (for @property see methods)* :
146
147    - **lindex** : list of Field
148    - **name** : name of the Cdataset
149    - **_analysis** : AnaDataset object
150
151    The methods defined in this class are :
152
153    *constructor (@classmethod)*
154
155    - `Cdataset.ntv`
156    - `Cdataset.from_ntv`
157
158    *dynamic value - module analysis (getters @property)*
159
160    - `DatasetAnalysis.analysis`
161    - `DatasetAnalysis.anafields`
162    - `DatasetAnalysis.lvarname`
163    - `DatasetAnalysis.partitions`
164    - `DatasetAnalysis.primaryname`
165    - `DatasetAnalysis.secondaryname`
166    - `DatasetAnalysis.complete`
167    - `DatasetAnalysis.dimension`
168
169    *selecting - infos methods (module analysis)*
170
171    - `DatasetAnalysis.field_partition`
172    - `DatasetAnalysis.indexinfos`
173    - `DatasetAnalysis.indicator`
174    - `DatasetAnalysis.relation`
175    - `DatasetAnalysis.tree`
176
177    *dynamic value (getters @property)*
178
179    - `Cdataset.keys`
180    - `Cdataset.iindex`
181    - `Cdataset.indexlen`
182    - `Cdataset.lenindex`
183    - `Cdataset.lname`
184    - `Cdataset.lunicname`
185    - `Cdataset.lunicrow`
186    - `Cdataset.tiindex`
187
188    *add - update methods (`observation.dataset_structure.DatasetStructure`)*
189
190    - `Cdataset.add`
191    - `Cdataset.delindex`
192    - `Cdataset.renameindex`
193    - `Cdataset.setname`
194
195    *structure management - methods (`observation.dataset_structure.DatasetStructure`)*
196
197    - `Cdataset.check_relation`
198    - `Cdataset.check_relationship`
199    - `Cdataset.nindex`
200    - `Cdataset.reindex`
201    - `Cdataset.reorder`
202    - `Cdataset.swapindex`
203    - `Cdataset.to_analysis`
204    '''
205    field_class = Cfield
206
207    def __init__(self, listidx=None, name=None, reindex=True):
208        '''
209        Dataset constructor.
210
211        *Parameters*
212
213        - **listidx** :  list (default None) - list of Field data
214        - **name** :  string (default None) - name of the dataset
215        - **reindex** : boolean (default True) - if True, default codec for each Field'''
216
217        if isinstance(listidx, Cdataset):
218            self.lindex = [copy(idx) for idx in listidx.lindex]
219            self.name = name if name else listidx.name
220            self._analysis = listidx._analysis
221            return
222        if listidx.__class__.__name__ == 'DataFrame':
223            lindex = NtvConnector.connector(
224            )['DataFrameConnec'].to_listidx(listidx)[0]
225            listidx = [Cfield(field['codec'], field['name'], field['keys'])
226                       for field in lindex]
227        self.name = name
228        self.lindex = [] if listidx is None else listidx
229        if reindex:
230            self.reindex()
231        self._analysis = None
232        return
233
234    def __repr__(self):
235        '''return classname, number of value and number of indexes'''
236        return self.__class__.__name__ + '[' + str(len(self)) + ', ' + str(self.lenindex) + ']'
237
238    def __str__(self):
239        '''return string format for var and lidx'''
240        stri = ''
241        stri += 'fields :\n'
242        for idx in self.lindex:
243            stri += '    ' + str(idx) + '\n'
244        return stri
245
246    def __len__(self):
247        ''' len of values'''
248        if not self.lindex:
249            return 0
250        return len(self.lindex[0])
251
252    def __contains__(self, item):
253        ''' list of lindex values'''
254        return item in self.lindex
255
256    def __getitem__(self, ind):
257        ''' return value record (value conversion)'''
258        res = [idx[ind] for idx in self.lindex]
259        if len(res) == 1:
260            return res[0]
261        return res
262
263    def __setitem__(self, ind, item):
264        ''' modify the Field values for each Field at the row ind'''
265        if not isinstance(item, list):
266            item = [item]
267        for val, idx in zip(item, self.lindex):
268            idx[ind] = val
269
270    def __delitem__(self, ind):
271        ''' remove all Field item at the row ind'''
272        for idx in self.lindex:
273            del idx[ind]
274
275    def __hash__(self):
276        '''return hash of all hash(Field)'''
277        #return hash(tuple(hash(idx) for idx in self.lindex))
278        return sum(hash(idx) for idx in self.lindex)
279
280    def __eq__(self, other):
281        ''' equal if hash values are equal'''
282        return hash(self) == hash(other)
283
284    def __copy__(self):
285        ''' Copy all the data '''
286        return self.__class__(self)
287
288# %% property
289    @property
290    def _hashd(self):
291        '''return hash of all hashf(Field)'''
292        # return sum([idx._hashi() for idx in self.lindex])
293        return hash(tuple(fld.hashf for fld in self.lindex))
294
295    @property
296    def indexlen(self):
297        ''' list of index codec length'''
298        return [len(idx.codec) for idx in self.lindex]
299
300    @property
301    def iindex(self):
302        ''' list of keys for each index'''
303        return [idx.keys for idx in self.lindex]
304
305    @property
306    def keys(self):
307        ''' list of keys for each index'''
308        return [idx.keys for idx in self.lindex]
309
310    @property
311    def lenindex(self):
312        ''' number of indexes'''
313        return len(self.lindex)
314
315    @property
316    def lunicname(self):
317        ''' list of unique index name'''
318        return [idx.name for idx in self.lindex if len(idx.codec) == 1]
319
320    @property
321    def lunicrow(self):
322        '''list of unic idx row'''
323        return [self.lname.index(name) for name in self.lunicname]
324
325    @property
326    def lname(self):
327        ''' list of index name'''
328        return [idx.name for idx in self.lindex]
329
330    @property
331    def tiindex(self):
332        ''' list of keys for each record'''
333        return Cutil.list(list(zip(*self.iindex)))
334
335# %%methods
336
337    @classmethod
338    def ntv(cls, ntv_value, reindex=True, fast=False):
339        '''Generate an Dataset Object from a ntv_value
340
341        *Parameters*
342
343        - **ntv_value** : bytes, string, Ntv object to convert
344        - **reindex** : boolean (default True) - if True, default codec for each Field
345        - **fast** : boolean (default False) - if True, ntv_value are not converted in json-value'''
346        return cls.from_ntv(ntv_value, reindex=reindex, fast=fast)
347
348    @classmethod
349    def from_ntv(cls, ntv_value, reindex=True, decode_str=False, fast=False):
350        '''Generate a Dataset Object from a ntv_value
351
352        *Parameters*
353
354        - **ntv_value** : bytes, string, Ntv object to convert
355        - **reindex** : boolean (default True) - if True, default codec for each Field
356        - **decode_str**: boolean (default False) - if True, string are loaded in json data
357        - **fast** : boolean (default False) - if True, ntv_value are not converted in json-value'''
358        ntv = Ntv.obj(ntv_value, decode_str=decode_str, fast=fast)
359        if len(ntv) == 0:
360            return cls()
361        lidx = [list(NtvUtil.decode_ntv_tab(
362            ntvf, cls.field_class.ntv_to_val)) for ntvf in ntv]
363        leng = max(idx[6] for idx in lidx)
364        for ind in range(len(lidx)):
365            if lidx[ind][0] == '':
366                lidx[ind][0] = 'i'+str(ind)
367            NtvConnector.init_ntv_keys(ind, lidx, leng)
368        lindex = [cls.field_class(idx[2], idx[0], idx[4], None,  # idx[1] pour le type,
369                                  reindex=reindex) for idx in lidx]
370        return cls(lindex, reindex=reindex, name=ntv.name)
371
372    def add(self, other, name=False, solve=True):
373        ''' Add other's values to self's values for each index
374
375        *Parameters*
376
377        - **other** : Dataset object to add to self object
378        - **name** : Boolean (default False) - Add values with same index name (True) or
379        same index row (False)
380        - **solve** : Boolean (default True) - If True, replace None other's codec value
381        with self codec value.
382
383        *Returns* : self '''
384        if self.lenindex != other.lenindex:
385            raise DatasetError('length are not identical')
386        if name and sorted(self.lname) != sorted(other.lname):
387            raise DatasetError('name are not identical')
388        for i in range(self.lenindex):
389            if name:
390                self.lindex[i].add(other.lindex[other.lname.index(self.lname[i])],
391                                   solve=solve)
392            else:
393                self.lindex[i].add(other.lindex[i], solve=solve)
394        return self
395
396    def to_analysis(self, distr=False):
397        '''return a dict with data used in AnaDataset module
398
399        *Parameters*
400
401        - **distr** : Boolean (default False) - If True, add distr information'''
402        return {'name': self.name, 'fields': [fld.to_analysis for fld in self.lindex],
403                'length': len(self), 'hashd': self._hashd,
404                'relations': {self.lindex[i].name:
405                              {self.lindex[j].name: Cutil.dist(
406                                  self.lindex[i].keys, self.lindex[j].keys, distr)
407                               for j in range(i+1, len(self.lindex))}
408                              for i in range(len(self.lindex)-1)}
409                }
410
411    def reindex(self):
412        '''Calculate a new default codec for each index (Return self)'''
413        for idx in self.lindex:
414            idx.reindex()
415        return self
416
417    def delindex(self, delname=None, savename=None):
418        '''remove an Field or a list of Field.
419
420        *Parameters*
421
422        - **delname** : string or list of string - name of index to remove
423        - **savename** : string or list of string - name of index to keep
424
425        *Returns* : none '''
426        if not delname and not savename:
427            return
428        if isinstance(delname, str):
429            delname = [delname]
430        if isinstance(savename, str):
431            savename = [savename]
432        if delname and savename:
433            delname = [name for name in delname if not name in savename]
434        if not delname:
435            delname = [name for name in self.lname if not name in savename]
436        for idxname in delname:
437            if idxname in self.lname:
438                self.lindex.pop(self.lname.index(idxname))
439
440    def nindex(self, name):
441        ''' index with name equal to attribute name'''
442        if name in self.lname:
443            return self.lindex[self.lname.index(name)]
444        return None
445
446    def renameindex(self, oldname, newname):
447        '''replace an index name 'oldname' by a new one 'newname'. '''
448        for i in range(self.lenindex):
449            if self.lname[i] == oldname:
450                self.lindex[i].setname(newname)
451        for i in range(len(self.lvarname)):
452            if self.lvarname[i] == oldname:
453                self.lvarname[i] = newname
454
455    def reorder(self, recorder=None):
456        '''Reorder records in the order define by 'recorder' '''
457        if recorder is None or set(recorder) != set(range(len(self))):
458            return None
459        for idx in self.lindex:
460            idx.set_keys([idx.keys[i] for i in recorder])
461        return None
462
463    def setname(self, listname=None):
464        '''Update Field name by the name in listname'''
465        for i in range(min(self.lenindex, len(listname))):
466            self.lindex[i].name = listname[i]
467
468    def swapindex(self, order):
469        '''
470        Change the order of the index .
471
472        *Parameters*
473
474        - **order** : list of int or list of name - new order of index to apply.
475
476        *Returns* : self '''
477        if self.lenindex != len(order):
478            raise DatasetError('length of order and Dataset different')
479        if not order or isinstance(order[0], int):
480            self.lindex = [self.lindex[ind] for ind in order]
481        elif isinstance(order[0], str):
482            self.lindex = [self.nindex(name) for name in order]
483        return self
484
485    def check_relation(self, field, parent, typecoupl, value=True):
486        '''get the inconsistent records for a relationship
487
488         *Parameters*
489
490        - **field** : int or str - index or name of the field involved in the relation
491        - **parent**: int or str - index or name of the second field involved in the relation
492        - **typecoupl**: str - relationship to check ('derived' or 'coupled')
493        - **value**: boolean (default True) - if True return a dict with inconsistent
494        values of the fields, else a tuple with index of records)
495
496        *Returns* :
497
498        - dict with inconsistent values of the fields
499        - or a tuple with index of records'''
500        f_parent = copy(self.nindex(parent) if isinstance(parent, str)
501                                            else self.lindex[parent])
502        f_field = copy(self.nindex(field) if isinstance(field, str)
503                                          else self.lindex[field])
504        match typecoupl:
505            case 'derived':
506                errors = f_parent.coupling(f_field, reindex=True)
507            case 'coupled':
508                errors = copy(f_parent).coupling(
509                    f_field, derived=False, reindex=True)
510            case _:
511                raise DatasetError(typecoupl + "is not a valid relationship")
512        if not value:
513            return errors
514        return {'row': list(errors), f_field.name: f_field[errors], f_parent.name: f_parent[errors]}
515
516    def check_relationship(self, relations):
517        '''get the inconsistent records for each relationship defined in relations
518
519         *Parameters*
520
521        - **relations** : list of dict or single dict - list of fields with relationship property
522
523        *Returns* :
524
525        - dict with for each relationship: key = string with the two fields name,
526        and value = list of inconsistent records
527        - or if single relationship : value'''
528        if not isinstance(relations, (list, dict)):
529            raise DatasetError("relations is not correct")
530        if isinstance(relations, dict):
531            relations = [relations]
532        dic_res = {}
533        for field in relations:
534            if not 'relationship' in field or not 'name' in field:
535                continue
536            if not 'parent' in field['relationship'] or not 'link' in field['relationship']:
537                raise DatasetError("relationship is not correct")
538            rel = field['relationship']['link']
539            f_parent = field['relationship']['parent']
540            f_field = field['name']
541            name_rel = f_field + ' - ' + f_parent
542            if self.nindex(f_parent) is None or self.nindex(f_field) is None:
543                raise DatasetError("field's name is not present in data")
544            dic_res[name_rel] = self.check_relation(
545                f_field, f_parent, rel, False)
546        if len(dic_res) == 1:
547            return list(dic_res.values())[0]
548        return dic_res

A Cdataset is a representation of a tabular data.

Attributes (for @property see methods) :

  • lindex : list of Field
  • name : name of the Cdataset
  • _analysis : AnaDataset object

The methods defined in this class are :

constructor (@classmethod)

dynamic value - module analysis (getters @property)

selecting - infos methods (module analysis)

dynamic value (getters @property)

add - update methods (observation.dataset_structure.DatasetStructure)

structure management - methods (observation.dataset_structure.DatasetStructure)

Cdataset(listidx=None, name=None, reindex=True)
207    def __init__(self, listidx=None, name=None, reindex=True):
208        '''
209        Dataset constructor.
210
211        *Parameters*
212
213        - **listidx** :  list (default None) - list of Field data
214        - **name** :  string (default None) - name of the dataset
215        - **reindex** : boolean (default True) - if True, default codec for each Field'''
216
217        if isinstance(listidx, Cdataset):
218            self.lindex = [copy(idx) for idx in listidx.lindex]
219            self.name = name if name else listidx.name
220            self._analysis = listidx._analysis
221            return
222        if listidx.__class__.__name__ == 'DataFrame':
223            lindex = NtvConnector.connector(
224            )['DataFrameConnec'].to_listidx(listidx)[0]
225            listidx = [Cfield(field['codec'], field['name'], field['keys'])
226                       for field in lindex]
227        self.name = name
228        self.lindex = [] if listidx is None else listidx
229        if reindex:
230            self.reindex()
231        self._analysis = None
232        return

Dataset constructor.

Parameters

  • listidx : list (default None) - list of Field data
  • name : string (default None) - name of the dataset
  • reindex : boolean (default True) - if True, default codec for each Field
indexlen

list of index codec length

iindex

list of keys for each index

keys

list of keys for each index

lenindex

number of indexes

lunicname

list of unique index name

lunicrow

list of unic idx row

lname

list of index name

tiindex

list of keys for each record

@classmethod
def ntv(cls, ntv_value, reindex=True, fast=False):
337    @classmethod
338    def ntv(cls, ntv_value, reindex=True, fast=False):
339        '''Generate an Dataset Object from a ntv_value
340
341        *Parameters*
342
343        - **ntv_value** : bytes, string, Ntv object to convert
344        - **reindex** : boolean (default True) - if True, default codec for each Field
345        - **fast** : boolean (default False) - if True, ntv_value are not converted in json-value'''
346        return cls.from_ntv(ntv_value, reindex=reindex, fast=fast)

Generate an Dataset Object from a ntv_value

Parameters

  • ntv_value : bytes, string, Ntv object to convert
  • reindex : boolean (default True) - if True, default codec for each Field
  • fast : boolean (default False) - if True, ntv_value are not converted in json-value
@classmethod
def from_ntv(cls, ntv_value, reindex=True, decode_str=False, fast=False):
348    @classmethod
349    def from_ntv(cls, ntv_value, reindex=True, decode_str=False, fast=False):
350        '''Generate a Dataset Object from a ntv_value
351
352        *Parameters*
353
354        - **ntv_value** : bytes, string, Ntv object to convert
355        - **reindex** : boolean (default True) - if True, default codec for each Field
356        - **decode_str**: boolean (default False) - if True, string are loaded in json data
357        - **fast** : boolean (default False) - if True, ntv_value are not converted in json-value'''
358        ntv = Ntv.obj(ntv_value, decode_str=decode_str, fast=fast)
359        if len(ntv) == 0:
360            return cls()
361        lidx = [list(NtvUtil.decode_ntv_tab(
362            ntvf, cls.field_class.ntv_to_val)) for ntvf in ntv]
363        leng = max(idx[6] for idx in lidx)
364        for ind in range(len(lidx)):
365            if lidx[ind][0] == '':
366                lidx[ind][0] = 'i'+str(ind)
367            NtvConnector.init_ntv_keys(ind, lidx, leng)
368        lindex = [cls.field_class(idx[2], idx[0], idx[4], None,  # idx[1] pour le type,
369                                  reindex=reindex) for idx in lidx]
370        return cls(lindex, reindex=reindex, name=ntv.name)

Generate a Dataset Object from a ntv_value

Parameters

  • ntv_value : bytes, string, Ntv object to convert
  • reindex : boolean (default True) - if True, default codec for each Field
  • decode_str: boolean (default False) - if True, string are loaded in json data
  • fast : boolean (default False) - if True, ntv_value are not converted in json-value
def add(self, other, name=False, solve=True):
372    def add(self, other, name=False, solve=True):
373        ''' Add other's values to self's values for each index
374
375        *Parameters*
376
377        - **other** : Dataset object to add to self object
378        - **name** : Boolean (default False) - Add values with same index name (True) or
379        same index row (False)
380        - **solve** : Boolean (default True) - If True, replace None other's codec value
381        with self codec value.
382
383        *Returns* : self '''
384        if self.lenindex != other.lenindex:
385            raise DatasetError('length are not identical')
386        if name and sorted(self.lname) != sorted(other.lname):
387            raise DatasetError('name are not identical')
388        for i in range(self.lenindex):
389            if name:
390                self.lindex[i].add(other.lindex[other.lname.index(self.lname[i])],
391                                   solve=solve)
392            else:
393                self.lindex[i].add(other.lindex[i], solve=solve)
394        return self

Add other's values to self's values for each index

Parameters

  • other : Dataset object to add to self object
  • name : Boolean (default False) - Add values with same index name (True) or same index row (False)
  • solve : Boolean (default True) - If True, replace None other's codec value with self codec value.

Returns : self

def to_analysis(self, distr=False):
396    def to_analysis(self, distr=False):
397        '''return a dict with data used in AnaDataset module
398
399        *Parameters*
400
401        - **distr** : Boolean (default False) - If True, add distr information'''
402        return {'name': self.name, 'fields': [fld.to_analysis for fld in self.lindex],
403                'length': len(self), 'hashd': self._hashd,
404                'relations': {self.lindex[i].name:
405                              {self.lindex[j].name: Cutil.dist(
406                                  self.lindex[i].keys, self.lindex[j].keys, distr)
407                               for j in range(i+1, len(self.lindex))}
408                              for i in range(len(self.lindex)-1)}
409                }

return a dict with data used in AnaDataset module

Parameters

  • distr : Boolean (default False) - If True, add distr information
def reindex(self):
411    def reindex(self):
412        '''Calculate a new default codec for each index (Return self)'''
413        for idx in self.lindex:
414            idx.reindex()
415        return self

Calculate a new default codec for each index (Return self)

def delindex(self, delname=None, savename=None):
417    def delindex(self, delname=None, savename=None):
418        '''remove an Field or a list of Field.
419
420        *Parameters*
421
422        - **delname** : string or list of string - name of index to remove
423        - **savename** : string or list of string - name of index to keep
424
425        *Returns* : none '''
426        if not delname and not savename:
427            return
428        if isinstance(delname, str):
429            delname = [delname]
430        if isinstance(savename, str):
431            savename = [savename]
432        if delname and savename:
433            delname = [name for name in delname if not name in savename]
434        if not delname:
435            delname = [name for name in self.lname if not name in savename]
436        for idxname in delname:
437            if idxname in self.lname:
438                self.lindex.pop(self.lname.index(idxname))

remove an Field or a list of Field.

Parameters

  • delname : string or list of string - name of index to remove
  • savename : string or list of string - name of index to keep

Returns : none

def nindex(self, name):
440    def nindex(self, name):
441        ''' index with name equal to attribute name'''
442        if name in self.lname:
443            return self.lindex[self.lname.index(name)]
444        return None

index with name equal to attribute name

def renameindex(self, oldname, newname):
446    def renameindex(self, oldname, newname):
447        '''replace an index name 'oldname' by a new one 'newname'. '''
448        for i in range(self.lenindex):
449            if self.lname[i] == oldname:
450                self.lindex[i].setname(newname)
451        for i in range(len(self.lvarname)):
452            if self.lvarname[i] == oldname:
453                self.lvarname[i] = newname

replace an index name 'oldname' by a new one 'newname'.

def reorder(self, recorder=None):
455    def reorder(self, recorder=None):
456        '''Reorder records in the order define by 'recorder' '''
457        if recorder is None or set(recorder) != set(range(len(self))):
458            return None
459        for idx in self.lindex:
460            idx.set_keys([idx.keys[i] for i in recorder])
461        return None

Reorder records in the order define by 'recorder'

def setname(self, listname=None):
463    def setname(self, listname=None):
464        '''Update Field name by the name in listname'''
465        for i in range(min(self.lenindex, len(listname))):
466            self.lindex[i].name = listname[i]

Update Field name by the name in listname

def swapindex(self, order):
468    def swapindex(self, order):
469        '''
470        Change the order of the index .
471
472        *Parameters*
473
474        - **order** : list of int or list of name - new order of index to apply.
475
476        *Returns* : self '''
477        if self.lenindex != len(order):
478            raise DatasetError('length of order and Dataset different')
479        if not order or isinstance(order[0], int):
480            self.lindex = [self.lindex[ind] for ind in order]
481        elif isinstance(order[0], str):
482            self.lindex = [self.nindex(name) for name in order]
483        return self

Change the order of the index .

Parameters

  • order : list of int or list of name - new order of index to apply.

Returns : self

def check_relation(self, field, parent, typecoupl, value=True):
485    def check_relation(self, field, parent, typecoupl, value=True):
486        '''get the inconsistent records for a relationship
487
488         *Parameters*
489
490        - **field** : int or str - index or name of the field involved in the relation
491        - **parent**: int or str - index or name of the second field involved in the relation
492        - **typecoupl**: str - relationship to check ('derived' or 'coupled')
493        - **value**: boolean (default True) - if True return a dict with inconsistent
494        values of the fields, else a tuple with index of records)
495
496        *Returns* :
497
498        - dict with inconsistent values of the fields
499        - or a tuple with index of records'''
500        f_parent = copy(self.nindex(parent) if isinstance(parent, str)
501                                            else self.lindex[parent])
502        f_field = copy(self.nindex(field) if isinstance(field, str)
503                                          else self.lindex[field])
504        match typecoupl:
505            case 'derived':
506                errors = f_parent.coupling(f_field, reindex=True)
507            case 'coupled':
508                errors = copy(f_parent).coupling(
509                    f_field, derived=False, reindex=True)
510            case _:
511                raise DatasetError(typecoupl + "is not a valid relationship")
512        if not value:
513            return errors
514        return {'row': list(errors), f_field.name: f_field[errors], f_parent.name: f_parent[errors]}

get the inconsistent records for a relationship

Parameters

  • field : int or str - index or name of the field involved in the relation
  • parent: int or str - index or name of the second field involved in the relation
  • typecoupl: str - relationship to check ('derived' or 'coupled')
  • value: boolean (default True) - if True return a dict with inconsistent values of the fields, else a tuple with index of records)

Returns :

  • dict with inconsistent values of the fields
  • or a tuple with index of records
def check_relationship(self, relations):
516    def check_relationship(self, relations):
517        '''get the inconsistent records for each relationship defined in relations
518
519         *Parameters*
520
521        - **relations** : list of dict or single dict - list of fields with relationship property
522
523        *Returns* :
524
525        - dict with for each relationship: key = string with the two fields name,
526        and value = list of inconsistent records
527        - or if single relationship : value'''
528        if not isinstance(relations, (list, dict)):
529            raise DatasetError("relations is not correct")
530        if isinstance(relations, dict):
531            relations = [relations]
532        dic_res = {}
533        for field in relations:
534            if not 'relationship' in field or not 'name' in field:
535                continue
536            if not 'parent' in field['relationship'] or not 'link' in field['relationship']:
537                raise DatasetError("relationship is not correct")
538            rel = field['relationship']['link']
539            f_parent = field['relationship']['parent']
540            f_field = field['name']
541            name_rel = f_field + ' - ' + f_parent
542            if self.nindex(f_parent) is None or self.nindex(f_field) is None:
543                raise DatasetError("field's name is not present in data")
544            dic_res[name_rel] = self.check_relation(
545                f_field, f_parent, rel, False)
546        if len(dic_res) == 1:
547            return list(dic_res.values())[0]
548        return dic_res

get the inconsistent records for each relationship defined in relations

Parameters

  • relations : list of dict or single dict - list of fields with relationship property

Returns :

  • dict with for each relationship: key = string with the two fields name, and value = list of inconsistent records
  • or if single relationship : value
class DatasetError(builtins.Exception):
551class DatasetError(Exception):
552    # %% errors
553    ''' Dataset Exception'''
554    # pass

Dataset Exception

Inherited Members
builtins.Exception
Exception
builtins.BaseException
with_traceback