python.observation.dataset_analysis

Created on Sun Oct 2 22:24:59 2022

@author: philippe@loco-labs.io

The python.observation.dataset_analysis module contains the Analysis class.

  1# -*- coding: utf-8 -*-
  2"""
  3Created on Sun Oct  2 22:24:59 2022
  4
  5@author: philippe@loco-labs.io
  6
  7The `python.observation.dataset_analysis` module contains the `Analysis` class.
  8
  9"""
 10
 11# %% declarations
 12from copy import copy
 13import pprint
 14from collections import Counter
 15
 16from observation.util import util
 17
 18
 19class Analysis:
 20    '''This class analyses relationships included in a tabular object 
 21    (Pandas DataFrame, Dataset, Observation, list of list).
 22
 23    The Analysis class includes the following functions:
 24    - identification and qualification of the relationships between Field,
 25    - generation of the global properties of the structure
 26    - data actualization based on structure updates
 27
 28    *Attributes* :
 29
 30    - **iobj** : Dataset or Observation associated to the Analysis object
 31    - **hashi** : internal Id of the iobj
 32    - **matrix** : square matrix with relationship properties between two fields
 33    - **infos** : list of characteristics (matrix synthesis)
 34    - **primary** : list of 'primary' fields row 
 35    - **secondary** : list of 'secondary' fields row 
 36    - **lvarname** : list of 'variable' fields name 
 37
 38    The methods defined in this class are :
 39
 40    - `Analysis.actualize`
 41    - `Analysis.getinfos`
 42    - `Analysis.check_relationship`
 43    - `Analysis.getmatrix`
 44    - `Analysis.getvarname`
 45    - `Analysis.getsecondary`
 46    - `Analysis.getprimary`
 47    - `Analysis.getpartition`
 48    - `Analysis.tree`
 49    '''
 50    # %% methods
 51
 52    def __init__(self, iobj):
 53        '''Analysis constructor.
 54
 55         *Parameters*
 56
 57        - **iobj** : object - tabular object (Pandas DataFrame, Dataset, Observation, 
 58        list of list)
 59
 60        Note: The Analysis data can be update only if tabular object is Dataset or 
 61        Observation.
 62        '''
 63        if iobj.__class__.__name__ in ('Dataset', 'Observation', 'Ndataset', 'Sdataset'):
 64            self.iobj = iobj
 65        elif iobj.__class__.__name__ == 'DataFrame':
 66            from observation import Sdataset
 67            self.iobj = Sdataset(iobj)
 68        else:
 69            from dataset import Dataset
 70            self.iobj = Dataset.obj(iobj)
 71        self.hashi = None
 72        self.matrix = None
 73        self.infos = None
 74        self.primary = None
 75        self.secondary = None
 76        self.lvarname = None
 77        self.partition = []
 78        self.groups = []
 79
 80    def actualize(self, partition=None):
 81        ''' update all data with new values of iobj
 82
 83         *Parameters*
 84
 85        - **partition** : list of int (default None) - partition to be used '''
 86        self.matrix = self._setmatrix()
 87        self._setinfos()
 88        self._setparent()
 89        self._setgroups()
 90        self._setpartition()
 91        self._setinfospartition(partition)
 92        self.hashi = self.iobj._hashi()
 93        self.lvarname = [idx['name']
 94                         for idx in self.infos if idx['cat'] == 'variable']
 95        coupledvar = [idx['name'] for idx in self.infos if idx['cat'] == 'coupled'
 96                      and self.infos[idx['parent']]['cat'] == 'variable']
 97        self.lvarname += coupledvar
 98        self.secondary = [idx['num']
 99                          for idx in self.infos if idx['cat'] == 'secondary']
100        coupledsec = [idx['num'] for idx in self.infos if idx['cat'] == 'coupled'
101                      and self.infos[idx['parent']]['cat'] in ('primary', 'secondary')]
102        self.secondary += coupledsec
103        #infosidx = [idx for idx in self.infos if idx['cat'] != 'variable']
104        infosidx = [idx for idx in self.infos if idx['cat'] != 'variable' and 
105                    not (idx['cat'] == 'coupled' and 
106                         self.infos[idx['parent']]['cat'] == 'variable') ]
107        self.primary = [infosidx.index(idx)
108                        for idx in infosidx if idx['cat'] == 'primary']
109
110    def check_relationship(self, relations):
111        '''get the list of inconsistent records for each relationship defined in relations
112
113         *Parameters*
114
115        - **relations** : list of dict - list of fields with relationship property
116        
117        *Returns* : dict with for each relationship: key = pair of name, 
118        and value = list of inconsistent records'''
119        if not isinstance(relations, (list, dict)):
120            raise AnalysisError("relations is not correct")
121        if isinstance(relations, dict):
122            relations = [relations]
123        dic_res = {}
124        for field in relations:
125            if not 'relationship' in field or not 'name' in field:
126                continue
127            if not 'parent' in field['relationship'] or not 'link' in field['relationship']:
128                raise AnalysisError("relationship is not correct")
129            rel = field['relationship']['link']
130            f_parent = self.iobj.nindex(field['relationship']['parent'])
131            f_field = self.iobj.nindex(field['name'])
132            name_rel = field['name'] + ' - ' + field['relationship']['parent']
133            if f_parent is None or f_field is None:
134                raise AnalysisError("field's name are  not present in data")
135            match rel:
136                case 'derived':
137                    dic_res[name_rel] = f_parent.coupling(f_field, reindex=True)                
138                case 'coupled':
139                    dic_res[name_rel] = f_parent.coupling(f_field, derived=False, reindex=True)    
140                case _:
141                    raise AnalysisError(rel + "is not a valid relationship")
142        return dic_res          
143    
144    def getinfos(self, keys=None):
145        '''return attribute infos
146
147         *Parameters*
148
149        - **keys** : string, list or tuple (default None) - list of attributes to returned
150        if 'all' or None, all attributes are returned
151        if 'struct', only structural attributes are returned'''
152        if self.hashi != self.iobj._hashi():
153            self.actualize()
154        if keys == 'struct':
155            keys = ['num', 'name', 'cat', 'child', 'crossed', 'distparent',
156                    'diffdistparent', 'parent', 'pparent', 'rateder', 'ratecpl']
157        if not keys or keys == 'all':
158            return self.infos
159        return [{k: v for k, v in inf.items() if k in keys} for inf in self.infos]
160
161    def getmatrix(self, name=None):
162        '''return attribute matrix or only one value of the matrix defined by two names
163
164         *Parameters*
165
166        - **name** : list or tuple (default None) - list of two fields names        
167        '''
168        if self.hashi != self.iobj._hashi():
169            self.actualize()
170        if not name or not isinstance(name, list):
171            return self.matrix
172        if name[0] in self.iobj.lname:
173            ind0 = self.iobj.lname.index(name[0])
174            if len(name) == 1:
175                return self.matrix[ind0]
176            if len(name) > 1 and name[1] in self.iobj.lname:
177                return self.matrix[ind0][self.iobj.lname.index(name[1])]
178        return None
179
180    def getvarname(self):
181        '''return variable Field name'''
182        if self.hashi != self.iobj._hashi():
183            self.actualize()
184        return self.lvarname
185
186    def getprimary(self):
187        '''return attribute primary'''
188        if self.hashi != self.iobj._hashi():
189            self.actualize()
190        return self.primary
191
192    def getsecondary(self):
193        '''return attribute secondary'''
194        if self.hashi != self.iobj._hashi():
195            self.actualize()
196        return self.secondary
197
198    def getpartition(self):
199        '''return attribute partition'''
200        if self.hashi != self.iobj._hashi():
201            self.actualize()
202        return self.partition
203
204    def getgroups(self):
205        '''return attribute groups'''
206        if self.hashi != self.iobj._hashi():
207            self.actualize()
208        return self.groups
209
210    def tree(self, mode='derived', width=5, lname=20, string=True):
211        '''return a string with a tree of derived Field.
212
213         *Parameters*
214
215        - **lname** : integer (default 20) - length of the names        
216        - **width** : integer (default 5) - length of the lines        
217        - **mode** : string (default 'derived') - kind of tree :
218            'derived' : derived tree
219            'distance': min distance tree
220            'diff': min dist rate tree
221        '''
222        if mode == 'derived':
223            modeparent = 'parent'
224        elif mode == 'distance':
225            modeparent = 'minparent'
226        elif mode == 'diff':
227            modeparent = 'distparent'
228        else:
229            raise AnalysisError('mode is unknown')
230        if self.hashi != self.iobj._hashi():
231            self.actualize()
232        child = [None] * (len(self.infos) + 1)
233        for i in range(len(self.infos)):
234            parent = self.infos[i][modeparent]
235            if child[parent + 1] is None:
236                child[parent + 1] = []
237            child[parent + 1].append(i)
238        tr = self._dic_noeud(-1, child, lname, mode)
239        if string:
240            tre = pprint.pformat(tr, indent=0, width=width)
241            tre = tre.replace('---', ' - ')
242            tre = tre.replace('  ', ' ')
243            tre = tre.replace('*', ' ')
244            for c in ["'", "\"", "{", "[", "]", "}", ","]:
245                tre = tre.replace(c, "")
246            return tre
247        return tr
248
249    # %% internal methods
250    def _setmatrix(self):
251        '''set and return matrix attributes (coupling infos between each idx)'''
252        lenindex = self.iobj.lenindex
253        mat = [[None for i in range(lenindex)] for i in range(lenindex)]
254        for i in range(lenindex):
255            for j in range(i, lenindex):
256                mat[i][j] = self.iobj.lindex[i].couplinginfos(
257                    self.iobj.lindex[j])
258            for j in range(i):
259                mat[i][j] = copy(mat[j][i])
260                if mat[i][j]['typecoupl'] == 'derived':
261                    mat[i][j]['typecoupl'] = 'derive'
262                elif mat[i][j]['typecoupl'] == 'derive':
263                    mat[i][j]['typecoupl'] = 'derived'
264                elif mat[i][j]['typecoupl'] == 'linked':
265                    mat[i][j]['typecoupl'] = 'link'
266                elif mat[i][j]['typecoupl'] == 'link':
267                    mat[i][j]['typecoupl'] = 'linked'
268        return mat
269
270    def _setinfos(self):
271        '''set and return attribute 'infos'. 
272        Infos is an array with infos of each index :
273            - num, name, cat, child, crossed, distparent, diffdistparent, 
274            parent, pparent, rateder'''
275        lenindex = self.iobj.lenindex
276        leniobj = len(self.iobj)
277        self.infos = [{} for i in range(lenindex)]
278        for i in range(lenindex):
279            self.infos[i]['num'] = i
280            self.infos[i]['name'] = self.iobj.lname[i]
281            self.infos[i]['cat'] = 'null'
282            self.infos[i]['parent'] = -1
283            self.infos[i]['distparent'] = -1
284            self.infos[i]['minparent'] = -1
285            self.infos[i]['pparent'] = -2
286            self.infos[i]['diffdistparent'] = -1
287            self.infos[i]['distance'] = leniobj * leniobj
288            self.infos[i]['ratecpl'] = 1
289            self.infos[i]['rateder'] = 1
290            self.infos[i]['child'] = []
291            self.infos[i]['crossed'] = []
292            self.infos[i] |= self.iobj.lindex[i].infos
293            if self.infos[i]['typecodec'] == 'unique':
294                self.infos[i]['pparent'] = -1
295                self.infos[i]['cat'] = 'unique'
296                self.infos[i]['diffdistparent'] = leniobj - 1
297                self.infos[i]['rateder'] = 0
298        for i in range(lenindex):
299            for j in range(i+1, lenindex):
300                if self.matrix[i][j]['typecoupl'] == 'coupled' and \
301                        self.infos[j]['parent'] == -1:
302                    self.infos[j]['parent'] = i
303                    self.infos[j]['distparent'] = i
304                    self.infos[j]['diffdistparent'] = 0
305                    self.infos[j]['rateder'] = 0
306                    self.infos[j]['cat'] = 'coupled'
307                    self.infos[i]['child'].append(j)
308        return
309
310    def _setinfospartition(self, partition=None):
311        '''add partition data into infos attribute'''
312        if not partition is None and not partition in self.partition:
313            raise AnalysisError('partition is not a valid partition')
314        lenindex = self.iobj.lenindex
315        infosp = self.infos
316        if not partition and len(self.partition) > 0:
317            partition = self.partition[0]
318        if partition:
319            for i in partition:
320                infosp[i]['cat'] = 'primary'
321                infosp[i]['pparent'] = i
322        for i in range(lenindex):
323            if infosp[i]['cat'] == 'null':
324                util.pparent2(i, infosp)
325                if infosp[i]['pparent'] == -1 and partition:
326                    infosp[i]['cat'] = 'variable'
327                else:
328                    infosp[i]['cat'] = 'secondary'
329        for i in range(lenindex):
330            if infosp[i]['cat'] == 'coupled':
331                infosp[i]['pparent'] = infosp[infosp[i]['parent']]['pparent']
332
    def _setparent(self):
        '''set parent (Field with minimal diff) for each Field.

        For each field i, three kinds of parent are searched among the other
        fields j and stored in infos :

        - 'parent' : field with the smallest 'diff' among the 'coupled' or
        'derived' relationships (i is added to the 'child' list of this field),
        - 'distparent' : field with the smallest 'rateder' ('diffdistparent' and
        'rateder' are updated with the matrix values),
        - 'minparent' : field with the smallest 'distance' ('distance' and
        'ratecpl' are updated with the matrix values).

        'parent' and 'distparent' are not searched for fields with a 'unique'
        or 'coupled' category. The 'crossed' list of infos is also filled.'''
        # parent : min(diff) -> child
        # distparent : min(rateder) -> diffdistparent, rateder(rateA)
        # minparent : min(distance) -> rate(rateB), distance
        lenindex = self.iobj.lenindex
        leniobj = len(self.iobj)
        for i in range(lenindex):
            # initial thresholds: a candidate must be strictly below them
            mindiff = leniobj
            ratedermin = 1
            distancemin = leniobj * leniobj
            distparent = None
            minparent = None
            parent = None
            infoi = self.infos[i]
            for j in range(lenindex):
                matij = self.matrix[i][j]
                if not infoi['cat'] in ['unique', 'coupled']:
                    # 'parent' candidate: j is skipped when i is already one
                    # of its 'parent' ancestors
                    if i != j and not i in self._listparent(j, 'parent') and \
                            matij['typecoupl'] in ('coupled', 'derived') and \
                            matij['diff'] < mindiff:
                        mindiff = matij['diff']
                        parent = j
                    # only reached when j is not kept as 'parent' candidate
                    elif i != j and matij['typecoupl'] == 'crossed' and \
                            self.infos[j]['cat'] != 'coupled':
                        infoi['crossed'].append(j)
                    # 'distparent' candidate: same ancestor filter, applied to
                    # the 'distparent' chain
                    if i != j and not i in self._listparent(j, 'distparent') and \
                        matij['typecoupl'] in ('coupled', 'derived', 'linked', 'crossed') and \
                       matij['rateder'] < ratedermin:
                        ratedermin = matij['rateder']
                        distparent = j
                # 'minparent' candidate (searched for every category of i): j is
                # not 'coupled' and its 'lencodec' is not lower than the one of i
                if i != j and not i in self._listparent(j, 'minparent') and \
                    matij['distance'] < distancemin and \
                    infoi['lencodec'] <= self.infos[j]['lencodec'] and \
                        self.infos[j]['cat'] != 'coupled':
                    distancemin = matij['distance']
                    minparent = j
            # results are stored once all the candidates j have been examined
            if not infoi['cat'] in ['unique', 'coupled']:
                if not parent is None:
                    infoi['parent'] = parent
                    self.infos[parent]['child'].append(i)
                if not distparent is None:
                    infoi['distparent'] = distparent
                    infoi['diffdistparent'] = self.matrix[i][distparent]['diff']
                    infoi['rateder'] = self.matrix[i][distparent]['rateder']
            if not minparent is None:
                infoi['minparent'] = minparent
                infoi['distance'] = self.matrix[i][minparent]['distance']
                infoi['ratecpl'] = self.matrix[i][minparent]['ratecpl']
            else:
                # no 'minparent' found: 'distance' falls back to this value
                infoi['distance'] = leniobj - infoi['lencodec']
        return
385
386    def _listparent(self, idx, typeparent):
387        parent = idx
388        listparent = []
389        while not parent is None and parent >= 0:
390            parent = self.infos[parent][typeparent]
391            if not parent is None and parent >= 0:
392                listparent.append(parent)
393        return listparent
394
395    def _dic_noeud(self, n, child, lname, mode):
396        '''generate a dict with nodes data defined by 'child' '''
397        if n == -1:
398            lis = ['root-' + mode + '*(' + str(len(self.iobj)) + ')']
399        else:
400            adding = ''
401            if mode == 'distance':
402                adding = str(self.infos[n]['distance']) + ' - '
403            elif mode == 'diff':
404                adding = str(format(self.infos[n]['rateder'], '.2e')) + ' - '
405            adding += str(self.infos[n]['lencodec'])
406            name = self.infos[n]['name'] + ' (' + adding + ')'
407            lis = [name.replace(' ', '*').replace("'", '*')]
408        if child[n+1]:
409            for ch in child[n+1]:
410                if ch != n:
411                    lis.append(self._dic_noeud(ch, child, lname, mode))
412        return {str(n).ljust(2, '*'): lis}
413
414    def _setgroups(self):
415        '''set groups (list of crossed Field groups)'''
416        self.groups = []
417        crossed = {info['num'] for info in self.infos if info['crossed']}
418        remove = set()
419        for num in crossed:
420            for num2 in crossed:
421                if num != num2 and self.infos[num]['parent'] in crossed:
422                    remove.add(num)
423        crossed -= remove     
424        setcrossed = set()
425        for num in crossed:
426            info = self.infos[num]
427            if not info['name'] in setcrossed:
428                setname = {self.infos[cros]['name'] for cros in info['crossed']
429                           if cros in crossed} | {info['name']}
430                self.groups.append(setname)
431                setcrossed |= setname
432        return None
433
434    def _setpartition(self):
435        '''set partition (list of Field partitions)'''
436        brother = {idx['num']: idx['crossed']
437                   for idx in self.infos if idx['crossed']}
438        self.partition = []
439        chemin = []
440        for cros in brother:
441            chemin = []
442            self._addchemin(chemin, cros, 1, brother)
443        childroot = [idx['num'] for idx in self.infos if idx['parent'] == -1
444                     and idx['typecodec'] in ('complete', 'full')]
445        if childroot:
446            self.partition.append(childroot)
447        return None
448
449    def _addchemin(self, chemin, node, lchemin, brother):
450        '''extend 'chemin' with new nodes and add it to 'partition' '''
451        if lchemin == len(self.iobj) and node == chemin[0] and \
452                max(Counter(zip(*[self.iobj.lindex[idx].keys for idx in chemin])).values()) == 1:
453            part = sorted(chemin)
454            if not part in self.partition:
455                if not self.partition or len(part) > len(self.partition[0]):
456                    self.partition.insert(0, part)
457                else:
458                    self.partition.append(part)
459        if node in chemin[1:]:
460            return
461        lnode = self.infos[node]['lencodec']
462        if lchemin * lnode <= len(self.iobj):
463            newchemin = chemin + [node]
464            for broth in brother[node]:
465                self._addchemin(newchemin, broth, lchemin * lnode, brother)
466
467
class AnalysisError(Exception):
    '''Exception raised by the `Analysis` class.'''
class Analysis:
 20class Analysis:
 21    '''This class analyses relationships included in a tabular object 
 22    (Pandas DataFrame, Dataset, Observation, list of list).
 23
 24    The Analysis class includes the following functions:
 25    - identification and qualification of the relationships between Field,
 26    - generation of the global properties of the structure
 27    - data actualization based on structure updates
 28
 29    *Attributes* :
 30
 31    - **iobj** : Dataset or Observation associated to the Analysis object
 32    - **hashi** : internal Id of the iobj
 33    - **matrix** : square matrix with relationship properties between two fields
 34    - **infos** : list of characteristics (matrix synthesis)
 35    - **primary** : list of 'primary' fields row 
 36    - **secondary** : list of 'secondary' fields row 
 37    - **lvarname** : list of 'variable' fields name 
 38
 39    The methods defined in this class are :
 40
 41    - `Analysis.actualize`
 42    - `Analysis.getinfos`
 43    - `Analysis.check_relationship`
 44    - `Analysis.getmatrix`
 45    - `Analysis.getvarname`
 46    - `Analysis.getsecondary`
 47    - `Analysis.getprimary`
 48    - `Analysis.getpartition`
 49    - `Analysis.tree`
 50    '''
 51    # %% methods
 52
 53    def __init__(self, iobj):
 54        '''Analysis constructor.
 55
 56         *Parameters*
 57
 58        - **iobj** : object - tabular object (Pandas DataFrame, Dataset, Observation, 
 59        list of list)
 60
 61        Note: The Analysis data can be update only if tabular object is Dataset or 
 62        Observation.
 63        '''
 64        if iobj.__class__.__name__ in ('Dataset', 'Observation', 'Ndataset', 'Sdataset'):
 65            self.iobj = iobj
 66        elif iobj.__class__.__name__ == 'DataFrame':
 67            from observation import Sdataset
 68            self.iobj = Sdataset(iobj)
 69        else:
 70            from dataset import Dataset
 71            self.iobj = Dataset.obj(iobj)
 72        self.hashi = None
 73        self.matrix = None
 74        self.infos = None
 75        self.primary = None
 76        self.secondary = None
 77        self.lvarname = None
 78        self.partition = []
 79        self.groups = []
 80
 81    def actualize(self, partition=None):
 82        ''' update all data with new values of iobj
 83
 84         *Parameters*
 85
 86        - **partition** : list of int (default None) - partition to be used '''
 87        self.matrix = self._setmatrix()
 88        self._setinfos()
 89        self._setparent()
 90        self._setgroups()
 91        self._setpartition()
 92        self._setinfospartition(partition)
 93        self.hashi = self.iobj._hashi()
 94        self.lvarname = [idx['name']
 95                         for idx in self.infos if idx['cat'] == 'variable']
 96        coupledvar = [idx['name'] for idx in self.infos if idx['cat'] == 'coupled'
 97                      and self.infos[idx['parent']]['cat'] == 'variable']
 98        self.lvarname += coupledvar
 99        self.secondary = [idx['num']
100                          for idx in self.infos if idx['cat'] == 'secondary']
101        coupledsec = [idx['num'] for idx in self.infos if idx['cat'] == 'coupled'
102                      and self.infos[idx['parent']]['cat'] in ('primary', 'secondary')]
103        self.secondary += coupledsec
104        #infosidx = [idx for idx in self.infos if idx['cat'] != 'variable']
105        infosidx = [idx for idx in self.infos if idx['cat'] != 'variable' and 
106                    not (idx['cat'] == 'coupled' and 
107                         self.infos[idx['parent']]['cat'] == 'variable') ]
108        self.primary = [infosidx.index(idx)
109                        for idx in infosidx if idx['cat'] == 'primary']
110
111    def check_relationship(self, relations):
112        '''get the list of inconsistent records for each relationship defined in relations
113
114         *Parameters*
115
116        - **relations** : list of dict - list of fields with relationship property
117        
118        *Returns* : dict with for each relationship: key = pair of name, 
119        and value = list of inconsistent records'''
120        if not isinstance(relations, (list, dict)):
121            raise AnalysisError("relations is not correct")
122        if isinstance(relations, dict):
123            relations = [relations]
124        dic_res = {}
125        for field in relations:
126            if not 'relationship' in field or not 'name' in field:
127                continue
128            if not 'parent' in field['relationship'] or not 'link' in field['relationship']:
129                raise AnalysisError("relationship is not correct")
130            rel = field['relationship']['link']
131            f_parent = self.iobj.nindex(field['relationship']['parent'])
132            f_field = self.iobj.nindex(field['name'])
133            name_rel = field['name'] + ' - ' + field['relationship']['parent']
134            if f_parent is None or f_field is None:
135                raise AnalysisError("field's name are  not present in data")
136            match rel:
137                case 'derived':
138                    dic_res[name_rel] = f_parent.coupling(f_field, reindex=True)                
139                case 'coupled':
140                    dic_res[name_rel] = f_parent.coupling(f_field, derived=False, reindex=True)    
141                case _:
142                    raise AnalysisError(rel + "is not a valid relationship")
143        return dic_res          
144    
145    def getinfos(self, keys=None):
146        '''return attribute infos
147
148         *Parameters*
149
150        - **keys** : string, list or tuple (default None) - list of attributes to returned
151        if 'all' or None, all attributes are returned
152        if 'struct', only structural attributes are returned'''
153        if self.hashi != self.iobj._hashi():
154            self.actualize()
155        if keys == 'struct':
156            keys = ['num', 'name', 'cat', 'child', 'crossed', 'distparent',
157                    'diffdistparent', 'parent', 'pparent', 'rateder', 'ratecpl']
158        if not keys or keys == 'all':
159            return self.infos
160        return [{k: v for k, v in inf.items() if k in keys} for inf in self.infos]
161
162    def getmatrix(self, name=None):
163        '''return attribute matrix or only one value of the matrix defined by two names
164
165         *Parameters*
166
167        - **name** : list or tuple (default None) - list of two fields names        
168        '''
169        if self.hashi != self.iobj._hashi():
170            self.actualize()
171        if not name or not isinstance(name, list):
172            return self.matrix
173        if name[0] in self.iobj.lname:
174            ind0 = self.iobj.lname.index(name[0])
175            if len(name) == 1:
176                return self.matrix[ind0]
177            if len(name) > 1 and name[1] in self.iobj.lname:
178                return self.matrix[ind0][self.iobj.lname.index(name[1])]
179        return None
180
181    def getvarname(self):
182        '''return variable Field name'''
183        if self.hashi != self.iobj._hashi():
184            self.actualize()
185        return self.lvarname
186
187    def getprimary(self):
188        '''return attribute primary'''
189        if self.hashi != self.iobj._hashi():
190            self.actualize()
191        return self.primary
192
193    def getsecondary(self):
194        '''return attribute secondary'''
195        if self.hashi != self.iobj._hashi():
196            self.actualize()
197        return self.secondary
198
199    def getpartition(self):
200        '''return attribute partition'''
201        if self.hashi != self.iobj._hashi():
202            self.actualize()
203        return self.partition
204
205    def getgroups(self):
206        '''return attribute groups'''
207        if self.hashi != self.iobj._hashi():
208            self.actualize()
209        return self.groups
210
211    def tree(self, mode='derived', width=5, lname=20, string=True):
212        '''return a string with a tree of derived Field.
213
214         *Parameters*
215
216        - **lname** : integer (default 20) - length of the names        
217        - **width** : integer (default 5) - length of the lines        
218        - **mode** : string (default 'derived') - kind of tree :
219            'derived' : derived tree
220            'distance': min distance tree
221            'diff': min dist rate tree
222        '''
223        if mode == 'derived':
224            modeparent = 'parent'
225        elif mode == 'distance':
226            modeparent = 'minparent'
227        elif mode == 'diff':
228            modeparent = 'distparent'
229        else:
230            raise AnalysisError('mode is unknown')
231        if self.hashi != self.iobj._hashi():
232            self.actualize()
233        child = [None] * (len(self.infos) + 1)
234        for i in range(len(self.infos)):
235            parent = self.infos[i][modeparent]
236            if child[parent + 1] is None:
237                child[parent + 1] = []
238            child[parent + 1].append(i)
239        tr = self._dic_noeud(-1, child, lname, mode)
240        if string:
241            tre = pprint.pformat(tr, indent=0, width=width)
242            tre = tre.replace('---', ' - ')
243            tre = tre.replace('  ', ' ')
244            tre = tre.replace('*', ' ')
245            for c in ["'", "\"", "{", "[", "]", "}", ","]:
246                tre = tre.replace(c, "")
247            return tre
248        return tr
249
250    # %% internal methods
251    def _setmatrix(self):
252        '''set and return matrix attributes (coupling infos between each idx)'''
253        lenindex = self.iobj.lenindex
254        mat = [[None for i in range(lenindex)] for i in range(lenindex)]
255        for i in range(lenindex):
256            for j in range(i, lenindex):
257                mat[i][j] = self.iobj.lindex[i].couplinginfos(
258                    self.iobj.lindex[j])
259            for j in range(i):
260                mat[i][j] = copy(mat[j][i])
261                if mat[i][j]['typecoupl'] == 'derived':
262                    mat[i][j]['typecoupl'] = 'derive'
263                elif mat[i][j]['typecoupl'] == 'derive':
264                    mat[i][j]['typecoupl'] = 'derived'
265                elif mat[i][j]['typecoupl'] == 'linked':
266                    mat[i][j]['typecoupl'] = 'link'
267                elif mat[i][j]['typecoupl'] == 'link':
268                    mat[i][j]['typecoupl'] = 'linked'
269        return mat
270
271    def _setinfos(self):
272        '''set and return attribute 'infos'. 
273        Infos is an array with infos of each index :
274            - num, name, cat, child, crossed, distparent, diffdistparent, 
275            parent, pparent, rateder'''
276        lenindex = self.iobj.lenindex
277        leniobj = len(self.iobj)
278        self.infos = [{} for i in range(lenindex)]
279        for i in range(lenindex):
280            self.infos[i]['num'] = i
281            self.infos[i]['name'] = self.iobj.lname[i]
282            self.infos[i]['cat'] = 'null'
283            self.infos[i]['parent'] = -1
284            self.infos[i]['distparent'] = -1
285            self.infos[i]['minparent'] = -1
286            self.infos[i]['pparent'] = -2
287            self.infos[i]['diffdistparent'] = -1
288            self.infos[i]['distance'] = leniobj * leniobj
289            self.infos[i]['ratecpl'] = 1
290            self.infos[i]['rateder'] = 1
291            self.infos[i]['child'] = []
292            self.infos[i]['crossed'] = []
293            self.infos[i] |= self.iobj.lindex[i].infos
294            if self.infos[i]['typecodec'] == 'unique':
295                self.infos[i]['pparent'] = -1
296                self.infos[i]['cat'] = 'unique'
297                self.infos[i]['diffdistparent'] = leniobj - 1
298                self.infos[i]['rateder'] = 0
299        for i in range(lenindex):
300            for j in range(i+1, lenindex):
301                if self.matrix[i][j]['typecoupl'] == 'coupled' and \
302                        self.infos[j]['parent'] == -1:
303                    self.infos[j]['parent'] = i
304                    self.infos[j]['distparent'] = i
305                    self.infos[j]['diffdistparent'] = 0
306                    self.infos[j]['rateder'] = 0
307                    self.infos[j]['cat'] = 'coupled'
308                    self.infos[i]['child'].append(j)
309        return
310
311    def _setinfospartition(self, partition=None):
312        '''add partition data into infos attribute'''
313        if not partition is None and not partition in self.partition:
314            raise AnalysisError('partition is not a valid partition')
315        lenindex = self.iobj.lenindex
316        infosp = self.infos
317        if not partition and len(self.partition) > 0:
318            partition = self.partition[0]
319        if partition:
320            for i in partition:
321                infosp[i]['cat'] = 'primary'
322                infosp[i]['pparent'] = i
323        for i in range(lenindex):
324            if infosp[i]['cat'] == 'null':
325                util.pparent2(i, infosp)
326                if infosp[i]['pparent'] == -1 and partition:
327                    infosp[i]['cat'] = 'variable'
328                else:
329                    infosp[i]['cat'] = 'secondary'
330        for i in range(lenindex):
331            if infosp[i]['cat'] == 'coupled':
332                infosp[i]['pparent'] = infosp[infosp[i]['parent']]['pparent']
333
334    def _setparent(self):
335        '''set parent (Field with minimal diff) for each Field'''
336        # parent : min(diff) -> child
337        # distparent : min(rateder) -> diffdistparent, rateder(rateA)
338        # minparent : min(distance) -> rate(rateB), distance
339        lenindex = self.iobj.lenindex
340        leniobj = len(self.iobj)
341        for i in range(lenindex):
342            mindiff = leniobj
343            ratedermin = 1
344            distancemin = leniobj * leniobj
345            distparent = None
346            minparent = None
347            parent = None
348            infoi = self.infos[i]
349            for j in range(lenindex):
350                matij = self.matrix[i][j]
351                if not infoi['cat'] in ['unique', 'coupled']:
352                    if i != j and not i in self._listparent(j, 'parent') and \
353                            matij['typecoupl'] in ('coupled', 'derived') and \
354                            matij['diff'] < mindiff:
355                        mindiff = matij['diff']
356                        parent = j
357                    elif i != j and matij['typecoupl'] == 'crossed' and \
358                            self.infos[j]['cat'] != 'coupled':
359                        infoi['crossed'].append(j)
360                    if i != j and not i in self._listparent(j, 'distparent') and \
361                        matij['typecoupl'] in ('coupled', 'derived', 'linked', 'crossed') and \
362                       matij['rateder'] < ratedermin:
363                        ratedermin = matij['rateder']
364                        distparent = j
365                if i != j and not i in self._listparent(j, 'minparent') and \
366                    matij['distance'] < distancemin and \
367                    infoi['lencodec'] <= self.infos[j]['lencodec'] and \
368                        self.infos[j]['cat'] != 'coupled':
369                    distancemin = matij['distance']
370                    minparent = j
371            if not infoi['cat'] in ['unique', 'coupled']:
372                if not parent is None:
373                    infoi['parent'] = parent
374                    self.infos[parent]['child'].append(i)
375                if not distparent is None:
376                    infoi['distparent'] = distparent
377                    infoi['diffdistparent'] = self.matrix[i][distparent]['diff']
378                    infoi['rateder'] = self.matrix[i][distparent]['rateder']
379            if not minparent is None:
380                infoi['minparent'] = minparent
381                infoi['distance'] = self.matrix[i][minparent]['distance']
382                infoi['ratecpl'] = self.matrix[i][minparent]['ratecpl']
383            else:
384                infoi['distance'] = leniobj - infoi['lencodec']
385        return
386
387    def _listparent(self, idx, typeparent):
388        parent = idx
389        listparent = []
390        while not parent is None and parent >= 0:
391            parent = self.infos[parent][typeparent]
392            if not parent is None and parent >= 0:
393                listparent.append(parent)
394        return listparent
395
396    def _dic_noeud(self, n, child, lname, mode):
397        '''generate a dict with nodes data defined by 'child' '''
398        if n == -1:
399            lis = ['root-' + mode + '*(' + str(len(self.iobj)) + ')']
400        else:
401            adding = ''
402            if mode == 'distance':
403                adding = str(self.infos[n]['distance']) + ' - '
404            elif mode == 'diff':
405                adding = str(format(self.infos[n]['rateder'], '.2e')) + ' - '
406            adding += str(self.infos[n]['lencodec'])
407            name = self.infos[n]['name'] + ' (' + adding + ')'
408            lis = [name.replace(' ', '*').replace("'", '*')]
409        if child[n+1]:
410            for ch in child[n+1]:
411                if ch != n:
412                    lis.append(self._dic_noeud(ch, child, lname, mode))
413        return {str(n).ljust(2, '*'): lis}
414
415    def _setgroups(self):
416        '''set groups (list of crossed Field groups)'''
417        self.groups = []
418        crossed = {info['num'] for info in self.infos if info['crossed']}
419        remove = set()
420        for num in crossed:
421            for num2 in crossed:
422                if num != num2 and self.infos[num]['parent'] in crossed:
423                    remove.add(num)
424        crossed -= remove     
425        setcrossed = set()
426        for num in crossed:
427            info = self.infos[num]
428            if not info['name'] in setcrossed:
429                setname = {self.infos[cros]['name'] for cros in info['crossed']
430                           if cros in crossed} | {info['name']}
431                self.groups.append(setname)
432                setcrossed |= setname
433        return None
434
435    def _setpartition(self):
436        '''set partition (list of Field partitions)'''
437        brother = {idx['num']: idx['crossed']
438                   for idx in self.infos if idx['crossed']}
439        self.partition = []
440        chemin = []
441        for cros in brother:
442            chemin = []
443            self._addchemin(chemin, cros, 1, brother)
444        childroot = [idx['num'] for idx in self.infos if idx['parent'] == -1
445                     and idx['typecodec'] in ('complete', 'full')]
446        if childroot:
447            self.partition.append(childroot)
448        return None
449
450    def _addchemin(self, chemin, node, lchemin, brother):
451        '''extend 'chemin' with new nodes and add it to 'partition' '''
452        if lchemin == len(self.iobj) and node == chemin[0] and \
453                max(Counter(zip(*[self.iobj.lindex[idx].keys for idx in chemin])).values()) == 1:
454            part = sorted(chemin)
455            if not part in self.partition:
456                if not self.partition or len(part) > len(self.partition[0]):
457                    self.partition.insert(0, part)
458                else:
459                    self.partition.append(part)
460        if node in chemin[1:]:
461            return
462        lnode = self.infos[node]['lencodec']
463        if lchemin * lnode <= len(self.iobj):
464            newchemin = chemin + [node]
465            for broth in brother[node]:
466                self._addchemin(newchemin, broth, lchemin * lnode, brother)

This class analyses relationships included in a tabular object (Pandas DataFrame, Dataset, Observation, list of list).

The Analysis class includes the following functions:

  • identification and qualification of the relationships between Field,
  • generation of the global properties of the structure
  • data actualization based on structure updates

Attributes :

  • iobj : Dataset or Observation associated to the Analysis object
  • hashi : internal Id of the iobj
  • matrix : square matrix with relationship properties between two fields
  • infos : list of characteristics (matrix synthesis)
  • primary : list of 'primary' fields row
  • secondary : list of 'secondary' fields row
  • lvarname : list of 'variable' fields name

The methods defined in this class are :

Analysis(iobj)
53    def __init__(self, iobj):
54        '''Analysis constructor.
55
56         *Parameters*
57
58        - **iobj** : object - tabular object (Pandas DataFrame, Dataset, Observation, 
59        list of list)
60
61        Note: The Analysis data can be update only if tabular object is Dataset or 
62        Observation.
63        '''
64        if iobj.__class__.__name__ in ('Dataset', 'Observation', 'Ndataset', 'Sdataset'):
65            self.iobj = iobj
66        elif iobj.__class__.__name__ == 'DataFrame':
67            from observation import Sdataset
68            self.iobj = Sdataset(iobj)
69        else:
70            from dataset import Dataset
71            self.iobj = Dataset.obj(iobj)
72        self.hashi = None
73        self.matrix = None
74        self.infos = None
75        self.primary = None
76        self.secondary = None
77        self.lvarname = None
78        self.partition = []
79        self.groups = []

Analysis constructor.

Parameters

  • iobj : object - tabular object (Pandas DataFrame, Dataset, Observation, list of list)

Note: The Analysis data can be updated only if the tabular object is a Dataset or an Observation.

hashi
matrix
infos
primary
secondary
lvarname
partition
groups
def actualize(self, partition=None):
 81    def actualize(self, partition=None):
 82        ''' update all data with new values of iobj
 83
 84         *Parameters*
 85
 86        - **partition** : list of int (default None) - partition to be used '''
 87        self.matrix = self._setmatrix()
 88        self._setinfos()
 89        self._setparent()
 90        self._setgroups()
 91        self._setpartition()
 92        self._setinfospartition(partition)
 93        self.hashi = self.iobj._hashi()
 94        self.lvarname = [idx['name']
 95                         for idx in self.infos if idx['cat'] == 'variable']
 96        coupledvar = [idx['name'] for idx in self.infos if idx['cat'] == 'coupled'
 97                      and self.infos[idx['parent']]['cat'] == 'variable']
 98        self.lvarname += coupledvar
 99        self.secondary = [idx['num']
100                          for idx in self.infos if idx['cat'] == 'secondary']
101        coupledsec = [idx['num'] for idx in self.infos if idx['cat'] == 'coupled'
102                      and self.infos[idx['parent']]['cat'] in ('primary', 'secondary')]
103        self.secondary += coupledsec
104        #infosidx = [idx for idx in self.infos if idx['cat'] != 'variable']
105        infosidx = [idx for idx in self.infos if idx['cat'] != 'variable' and 
106                    not (idx['cat'] == 'coupled' and 
107                         self.infos[idx['parent']]['cat'] == 'variable') ]
108        self.primary = [infosidx.index(idx)
109                        for idx in infosidx if idx['cat'] == 'primary']

update all data with new values of iobj

Parameters

  • partition : list of int (default None) - partition to be used
def check_relationship(self, relations):
111    def check_relationship(self, relations):
112        '''get the list of inconsistent records for each relationship defined in relations
113
114         *Parameters*
115
116        - **relations** : list of dict - list of fields with relationship property
117        
118        *Returns* : dict with for each relationship: key = pair of name, 
119        and value = list of inconsistent records'''
120        if not isinstance(relations, (list, dict)):
121            raise AnalysisError("relations is not correct")
122        if isinstance(relations, dict):
123            relations = [relations]
124        dic_res = {}
125        for field in relations:
126            if not 'relationship' in field or not 'name' in field:
127                continue
128            if not 'parent' in field['relationship'] or not 'link' in field['relationship']:
129                raise AnalysisError("relationship is not correct")
130            rel = field['relationship']['link']
131            f_parent = self.iobj.nindex(field['relationship']['parent'])
132            f_field = self.iobj.nindex(field['name'])
133            name_rel = field['name'] + ' - ' + field['relationship']['parent']
134            if f_parent is None or f_field is None:
135                raise AnalysisError("field's name are  not present in data")
136            match rel:
137                case 'derived':
138                    dic_res[name_rel] = f_parent.coupling(f_field, reindex=True)                
139                case 'coupled':
140                    dic_res[name_rel] = f_parent.coupling(f_field, derived=False, reindex=True)    
141                case _:
142                    raise AnalysisError(rel + "is not a valid relationship")
143        return dic_res          

get the list of inconsistent records for each relationship defined in relations

Parameters

  • relations : list of dict - list of fields with relationship property

Returns : dict with for each relationship: key = pair of name, and value = list of inconsistent records

def getinfos(self, keys=None):
145    def getinfos(self, keys=None):
146        '''return attribute infos
147
148         *Parameters*
149
150        - **keys** : string, list or tuple (default None) - list of attributes to returned
151        if 'all' or None, all attributes are returned
152        if 'struct', only structural attributes are returned'''
153        if self.hashi != self.iobj._hashi():
154            self.actualize()
155        if keys == 'struct':
156            keys = ['num', 'name', 'cat', 'child', 'crossed', 'distparent',
157                    'diffdistparent', 'parent', 'pparent', 'rateder', 'ratecpl']
158        if not keys or keys == 'all':
159            return self.infos
160        return [{k: v for k, v in inf.items() if k in keys} for inf in self.infos]

return attribute infos

Parameters

  • keys : string, list or tuple (default None) - list of attributes to return; if 'all' or None, all attributes are returned; if 'struct', only structural attributes are returned
def getmatrix(self, name=None):
162    def getmatrix(self, name=None):
163        '''return attribute matrix or only one value of the matrix defined by two names
164
165         *Parameters*
166
167        - **name** : list or tuple (default None) - list of two fields names        
168        '''
169        if self.hashi != self.iobj._hashi():
170            self.actualize()
171        if not name or not isinstance(name, list):
172            return self.matrix
173        if name[0] in self.iobj.lname:
174            ind0 = self.iobj.lname.index(name[0])
175            if len(name) == 1:
176                return self.matrix[ind0]
177            if len(name) > 1 and name[1] in self.iobj.lname:
178                return self.matrix[ind0][self.iobj.lname.index(name[1])]
179        return None

return attribute matrix or only one value of the matrix defined by two names

Parameters

  • name : list or tuple (default None) - list of two fields names
def getvarname(self):
181    def getvarname(self):
182        '''return variable Field name'''
183        if self.hashi != self.iobj._hashi():
184            self.actualize()
185        return self.lvarname

return variable Field name

def getprimary(self):
187    def getprimary(self):
188        '''return attribute primary'''
189        if self.hashi != self.iobj._hashi():
190            self.actualize()
191        return self.primary

return attribute primary

def getsecondary(self):
193    def getsecondary(self):
194        '''return attribute secondary'''
195        if self.hashi != self.iobj._hashi():
196            self.actualize()
197        return self.secondary

return attribute secondary

def getpartition(self):
199    def getpartition(self):
200        '''return attribute partition'''
201        if self.hashi != self.iobj._hashi():
202            self.actualize()
203        return self.partition

return attribute partition

def getgroups(self):
205    def getgroups(self):
206        '''return attribute groups'''
207        if self.hashi != self.iobj._hashi():
208            self.actualize()
209        return self.groups

return attribute groups

def tree(self, mode='derived', width=5, lname=20, string=True):
211    def tree(self, mode='derived', width=5, lname=20, string=True):
212        '''return a string with a tree of derived Field.
213
214         *Parameters*
215
216        - **lname** : integer (default 20) - length of the names        
217        - **width** : integer (default 5) - length of the lines        
218        - **mode** : string (default 'derived') - kind of tree :
219            'derived' : derived tree
220            'distance': min distance tree
221            'diff': min dist rate tree
222        '''
223        if mode == 'derived':
224            modeparent = 'parent'
225        elif mode == 'distance':
226            modeparent = 'minparent'
227        elif mode == 'diff':
228            modeparent = 'distparent'
229        else:
230            raise AnalysisError('mode is unknown')
231        if self.hashi != self.iobj._hashi():
232            self.actualize()
233        child = [None] * (len(self.infos) + 1)
234        for i in range(len(self.infos)):
235            parent = self.infos[i][modeparent]
236            if child[parent + 1] is None:
237                child[parent + 1] = []
238            child[parent + 1].append(i)
239        tr = self._dic_noeud(-1, child, lname, mode)
240        if string:
241            tre = pprint.pformat(tr, indent=0, width=width)
242            tre = tre.replace('---', ' - ')
243            tre = tre.replace('  ', ' ')
244            tre = tre.replace('*', ' ')
245            for c in ["'", "\"", "{", "[", "]", "}", ","]:
246                tre = tre.replace(c, "")
247            return tre
248        return tr

return a string with a tree of derived Field.

Parameters

  • lname : integer (default 20) - length of the names
  • width : integer (default 5) - length of the lines
  • mode : string (default 'derived') - kind of tree : 'derived' : derived tree 'distance': min distance tree 'diff': min dist rate tree
class AnalysisError(builtins.Exception):
class AnalysisError(Exception):
    '''Exception raised by the Analysis class (invalid parameter or data).'''

Analysis Exception

Inherited Members
builtins.Exception
Exception
builtins.BaseException
with_traceback
args