tab-analysis.tab_analysis.analysis

This module analyses structure and relationships included in a tabular object (Pandas DataFrame, Dataset, list of list) :

  • Structure of a single field (class AnaField),
  • Relationship between two fields (class AnaRelation)
  • Structure and relationships of fields inside a dataset (class AnaDfield)
  • Structure of a dataset (class AnaDataset)

It also contains two other classes: Util and AnaError.

   1# -*- coding: utf-8 -*-
   2"""
   3This module analyses structure and relationships included in a tabular object
   4(Pandas DataFrame, Dataset, list of list) :
   5- Structure of a single field (class `AnaField`),
   6- Relationship between two fields (class `AnaRelation`)
   7- Structure and relationships of fields inside a dataset (class `AnaDfield`)
   8- Structure of a dataset (class `AnaDataset`)
   9
  10It contains two another classes `Util`, `AnaError`.
  11"""
  12import json
  13import pprint
  14from itertools import combinations
  15from operator import mul
  16from functools import reduce
  17
# Values returned by `AnaField.typecodec` (UNIQUE and MIXED are also returned
# by `AnaDfield.category`).
NULL = 'null'
UNIQUE = 'unique'
COMPLETE = 'complete'
FULL = 'full'
DEFAULT = 'default'
MIXED = 'mixed'

# Values returned by `AnaRelation.typecoupl` and/or `AnaDfield.category`.
# DISTRIBUTED is used as a key of `AnaRelation.to_dict` and ROOT is the
# idfield given to the root field built by `AnaDataset.root`.
COUPLED = 'coupled'
DERIVED = 'derived'
LINKED = 'linked'
CROSSED = 'crossed'
DISTRIBUTED = 'distributed'
ROOTED = 'rooted'
ROOT = 'root'

# Keys of the dicts read or built by AnaField, AnaRelation and AnaDfield
# (`__init__` / `to_dict`). Some of them are not referenced in the part of
# the file shown here.
IDFIELD = 'id'
MINCODEC = 'mincodec'
MAXCODEC = 'maxcodec'
LENCODEC = 'lencodec'
RATECODEC = 'ratecodec'
DMINCODEC = 'dmincodec'
DMAXCODEC = 'dmaxcodec'
RANCODEC = 'rancodec'
TYPECODEC = 'typecodec'
HASHF = 'hashf'
RELATION = 'relation'
HASHR = 'hashr'
DIST = 'dist'
DMAX = 'dmax'
DMIN = 'dmin'
DIFF = 'diff'
DRAN = 'dran'
NUM = 'num'
CATEGORY = 'category'
PDERIVED = 'pderived'
PDISTANCE = 'pdistance'
PDISTOMIN = 'pdistomin'
DISDISTANCE = 'disdistance'
DERDISTANCE = 'derdistance'
DISRATECPL = 'disratecpl'
DERRATECPL = 'derratecpl'
DISRATEDER = 'disrateder'
DERRATEDER = 'derrateder'

# Keys of the relationship indicators (see `AnaRelation.to_dict`); DISTROOT
# is added by `AnaDfield.to_dict`.
TYPECOUPL = 'typecoupl'
PARENTCHILD = 'parentchild'
DISTANCE = 'distance'
DISTOMIN = 'distomin'
DISTOMAX = 'distomax'
DISTROOT = 'distroot'
RATECPL = 'ratecpl'
RATEDER = 'rateder'

# Keys read by `AnaDataset.__init__` when the dataset is described by a dict.
IDDATASET = 'name'
RELATIONS = 'relations'
FIELDS = 'fields'
LENGTH = 'length'
HASHD = 'hashd'
  76
  77
  78class AnaField:
  79    '''This class analyses field entities.
  80
  81    *Attributes*
  82
  83    - **idfield** : string - name or Id of the field
  84    - **lencodec**: integer - codec length
  85    - **mincodec**: integer - minimal codec length
  86    - **maxcodec**: integer - minimal codec length
  87    - **hashf**: integer - hash value to identify modifications
  88
  89    *characteristic (@property)*
  90
  91    - `iscomplete`
  92    - `ratecodec`
  93    - `dmincodec`
  94    - `dmaxcodec`
  95    - `rancodec`
  96    - `typecodec`
  97
  98    *instance methods*
  99
 100    - `to_dict`
 101
 102    '''
 103
 104    def __init__(self, idfield, lencodec=None, mincodec=None, maxcodec=None, hashf=None):
 105        '''Creation mode :
 106        - single dict attribute where keys are attributes name,
 107        - single AnaField attribute to make a copy
 108        - multiple attributes
 109
 110        *Parameters (multiple attributes)*
 111
 112        - **idfield** : string or integer - Id of the Field
 113        - **lencodec** : integer (default None) - length of the codec
 114        - **mincodec** : integer (default None) - number of different values
 115        - **maxcodec** : integer (default None) - length of the field
 116        - **hashf** : string (default None) - update identifier
 117
 118        *example*
 119
 120        AnaField is created with a dict
 121        >>> AnaField(Cfield([1,2,3,3]).to_analysis).to_dict()
 122        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
 123        >>> AnaField({'lencodec': 4, 'mincodec': 3, 'maxcodec': 4})
 124        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
 125
 126        AnaField is created with parameters
 127        >>> AnaField(lencodec=4, mincodec=3, maxcodec=4).to_dict()
 128        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
 129        >>> AnaField(4, 3, 4).to_dict()
 130        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
 131        '''
 132        if isinstance(idfield, dict):
 133            self.idfield = idfield.get(IDFIELD, None)
 134            self.lencodec = idfield.get(LENCODEC, None)
 135            self.mincodec = idfield.get(MINCODEC, None)
 136            self.maxcodec = idfield.get(MAXCODEC, None)
 137            self.hashf = idfield.get(HASHF, None)
 138            return
 139        if isinstance(idfield, (AnaField, AnaDfield)):
 140            self.idfield = idfield.idfield
 141            self.lencodec = idfield.lencodec
 142            self.mincodec = idfield.mincodec
 143            self.maxcodec = idfield.maxcodec
 144            self.hashf = idfield.hashf
 145            return
 146        if not lencodec or not isinstance(lencodec, int):
 147            raise AnaError("lencodec is not correct")
 148        self.idfield = idfield
 149        self.lencodec = lencodec
 150        self.mincodec = mincodec
 151        self.maxcodec = maxcodec
 152        self.hashf = hashf
 153
 154    def __len__(self):
 155        '''length of the field (maxcodec)'''
 156        return self.maxcodec if self.maxcodec else self.lencodec
 157
 158    def __repr__(self):
 159        '''representation of the field (class name + idfield)'''
 160        return self.__class__.__name__ + '(' + str(self.idfield) + ')'
 161
 162    def __eq__(self, other):
 163        ''' equal if class and attributes are equal'''
 164        return self.__class__ .__name__ == other.__class__.__name__ and \
 165            self.idfield == other.idfield and self.lencodec == other.lencodec and \
 166            self.mincodec == other.mincodec and self.maxcodec == other.maxcodec and \
 167            self.hashf == other.hashf
 168
 169    def __lt__(self, other):
 170        ''' return a comparison between hash value'''
 171        return hash(self) < hash(other)
 172
 173    def __hash__(self):
 174        '''return hash value (sum of attributes hash)'''
 175        return hash(self.idfield) + hash(self.lencodec) + hash(self.mincodec) \
 176            + hash(self.maxcodec) + hash(self.hashf)
 177
 178    def __str__(self):
 179        '''json-text build with the attributes dict'''
 180        return json.dumps(self.to_dict(idfield=True))
 181
 182    def __copy__(self):
 183        ''' Copy all the attributes '''
 184        return self.__class__(self)
 185
 186    def to_dict(self, full=False, idfield=False, notnone=True):
 187        '''return a dict with field attributes.
 188
 189         *Parameters*
 190
 191        - **full** : boolean (default False) - if True, all the attributes are included
 192        - **idfield** : boolean (default False) - if True, idfield is included
 193        - **notnone** : boolean (default True) - if True, None values are not included
 194        '''
 195        dic = {LENCODEC: self.lencodec, MINCODEC: self.mincodec,
 196               MAXCODEC: self.maxcodec}
 197        if idfield or full:
 198            dic[IDFIELD] = self.idfield
 199        if full:
 200            dic |= {RATECODEC: self.ratecodec, DMINCODEC: self.dmincodec,
 201                    DMAXCODEC: self.dmaxcodec, RANCODEC: self.rancodec,
 202                    TYPECODEC: self.typecodec}
 203        if notnone:
 204            return Util.reduce_dic(dic)
 205        return dic
 206
 207    @property
 208    def iscomplete(self):
 209        '''return boolean indicator : True if all attributes are present'''
 210        return not self.maxcodec is None and not self.mincodec is None
 211
 212    @property
 213    def ratecodec(self):
 214        '''return float ratecodec indicator'''
 215        if self.iscomplete and self.maxcodec - self.mincodec:
 216            return (self.maxcodec - self.lencodec) / (self.maxcodec - self.mincodec)
 217        return None
 218
 219    @property
 220    def dmincodec(self):
 221        '''return integer dmincodec indicator'''
 222        return self.lencodec - self.mincodec if self.iscomplete else None
 223
 224    @property
 225    def dmaxcodec(self):
 226        '''return integer dmaxcodec indicator'''
 227        return self.maxcodec - self.lencodec if self.iscomplete else None
 228
 229    @property
 230    def rancodec(self):
 231        '''return integer rancodec indicator'''
 232        return self.maxcodec - self.mincodec if self.iscomplete else None
 233
 234    @property
 235    def typecodec(self):
 236        '''return string typecodec indicator
 237        (null, unique, complete, full, default, mixed)
 238        '''
 239        if self.maxcodec is None or self.mincodec is None:
 240            return None
 241        if self.maxcodec == 0:
 242            return NULL
 243        if self.lencodec == 1:
 244            return UNIQUE
 245        if self.mincodec == self.maxcodec:
 246            return COMPLETE
 247        if self.lencodec == self.maxcodec:
 248            return FULL
 249        if self.lencodec == self.mincodec:
 250            return DEFAULT
 251        return MIXED
 252
 253
 254class AnaRelation:
 255    '''This class analyses relationship between two fields
 256
 257    *Attributes* :
 258
 259    - **relation** : List of the two fields involved in the relationship
 260    - **dist** : value of the relationship
 261    - **distrib** : boolean True if values are distributed
 262    - **hashr**: integer - hash value to identify update
 263
 264    *global (@property)*
 265
 266    - `id_relation`
 267    - `index_relation`
 268    - `parent_child`
 269    - `typecoupl`
 270
 271    *characteristic (@property)*
 272
 273    - `dmax`
 274    - `dmin`
 275    - `diff`
 276    - `dran`
 277    - `distomin`
 278    - `distomax`
 279    - `distance`
 280    - `ratecpl`
 281    - `rateder`
 282
 283    *instance methods*
 284
 285    - `to_dict`
 286    '''
 287
 288    def __init__(self, relation, dists, hashr=None):
 289        '''Constructor of the relationship :
 290
 291         *Parameters*
 292
 293        - **relation** : List of the two fields involved in the relationship
 294        - **dists** : dist value or list of dist value and distrib boolean
 295        - **distrib** : boolean True if values are distributed
 296        - **hashr**: integer - hash value to identify update
 297        '''
 298        self.relation = relation
 299        if isinstance(dists, list):
 300            self.dist = dists[0]
 301            self.distrib = dists[1]
 302        else:
 303            self.dist = dists
 304            self.distrib = None
 305        self.hashr = hashr
 306
 307    def __repr__(self):
 308        '''representation of the field (class name + idfield)'''
 309        return self.__class__.__name__ + '(' + str(self.id_relation) + ')'
 310
 311    def __str__(self):
 312        '''json-text build with the attributes dict'''
 313        return json.dumps(self.to_dict(relation=True))
 314
 315    def __eq__(self, other):
 316        ''' equal if class and values are equal'''
 317        return self.__class__ .__name__ == other.__class__.__name__ and \
 318            self.relation == other.relation and self.dist == other.dist and \
 319            self.hashr == other.hashr and self.distrib == other.distrib
 320
 321    def __hash__(self):
 322        '''return hash value (sum of attributes hash)'''
 323        return hash(self.relation[0]) + hash(self.relation[1]) + \
 324            hash(self.dist) + hash(self.hashr) + hash(self.distrib)
 325
 326    def to_dict(self, distances=False, full=False, mode='field', relation=False,
 327                notnone=True, misc=False):
 328        '''return a dict with AnaRelation attributes.
 329
 330         *Parameters*
 331
 332        - **distances** : boolean (default False) - if True, distances indicators are included
 333        - **full** : boolean (default False) - if True, all the attributes are included
 334        - **relation** : boolean (default False) - if True, idfield are included
 335        - **notnone** : boolean (default True) - if True, None values are not included
 336        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
 337        '''
 338        dic = {DIST: self.dist, TYPECOUPL: self.typecoupl, HASHR: self.hashr}
 339        if relation or full:
 340            dic[RELATION] = Util.view(self.relation, mode)
 341            dic[PARENTCHILD] = self.parent_child
 342        if distances or full:
 343            dic |= {DISTANCE: self.distance, DISTOMIN: self.distomin,
 344                    DISTOMAX: self.distomax, DISTRIBUTED: self.distrib,
 345                    RATECPL: self.ratecpl, RATEDER: self.rateder}
 346        if misc or full:
 347            dic |= {DMAX: self.dmax, DMIN: self.dmin,
 348                    DIFF: self.diff, DRAN: self.dran}
 349        if notnone:
 350            return Util.reduce_dic(dic)
 351        return dic
 352
 353    @property
 354    def id_relation(self):
 355        '''return a list with the id of the two fields involved'''
 356        if self.relation:
 357            return [fld.idfield for fld in self.relation]
 358        return []
 359
 360    @property
 361    def parent_child(self):
 362        '''returns the direction of the relationship (True if parent is first)'''
 363        rel0 = self.relation[0]
 364        rel1 = self.relation[1]
 365        return (rel0.lencodec > rel1.lencodec or
 366                (rel0.lencodec == rel1.lencodec and rel0.index < rel1.index))
 367
 368    @property
 369    def index_relation(self):
 370        '''return a list with the index of the two fields involved'''
 371        if self.relation:
 372            return [fld.index for fld in self.relation]
 373        return []
 374
 375    @property
 376    def dmax(self):
 377        '''return integer dmax indicator'''
 378        return self.relation[0].lencodec * self.relation[1].lencodec
 379
 380    @property
 381    def dmin(self):
 382        '''return integer dmin indicator'''
 383        return max(self.relation[0].lencodec, self.relation[1].lencodec)
 384
 385    @property
 386    def diff(self):
 387        '''return integer diff indicator'''
 388        return abs(self.relation[0].lencodec - self.relation[1].lencodec)
 389
 390    @property
 391    def dran(self):
 392        '''return integer dran indicator'''
 393        return self.dmax - self.dmin
 394
 395    @property
 396    def distomin(self):
 397        '''return integer distomin indicator'''
 398        return self.dist - self.dmin
 399
 400    @property
 401    def distomax(self):
 402        '''return integer distomax indicator'''
 403        return self.dmax - self.dist
 404
 405    @property
 406    def distance(self):
 407        '''return integer distance indicator'''
 408        return self.distomin + self.diff
 409
 410    @property
 411    def ratecpl(self):
 412        '''return float ratecpl indicator'''
 413        disdis = self.distance + self.distomax
 414        return 0 if disdis == 0 else self.distance / disdis
 415
 416    @property
 417    def rateder(self):
 418        '''return float rateder indicator'''
 419        return 0 if self.dran == 0 else self.distomin / self.dran
 420
 421    @property
 422    def typecoupl(self):
 423        '''return relationship type (coupled, derived, crossed, linked)'''
 424        if self.distance == 0:
 425            return COUPLED
 426        if self.distomin == 0:
 427            return DERIVED
 428        if self.distomax == 0:
 429            return CROSSED
 430        return LINKED
 431
 432
 433class AnaDfield(AnaField):
 434    '''This class analyses structure and relationships of fields inside a dataset
 435
 436    *Attributes* :
 437
 438    - **dataset** : AnaDataset object where AnaDfield is included
 439    - **AnaField attributes** : inheritance of AnaField object
 440
 441    *relationship (@property)*
 442
 443    - `list_relations`
 444    - `list_p_derived`
 445    - `list_c_derived`
 446    - `list_coupled`
 447
 448    *field (@property)*
 449
 450    - `fields`
 451    - `p_derived`
 452    - `p_distance`
 453    - `p_distomin`
 454
 455    *global (@property)*
 456
 457    - `index`
 458    - `dist_root`
 459    - `category`
 460
 461    *global (instance methods)*
 462
 463    - `ascendants`
 464    - `to_dict`
 465    - `view`
 466
 467    *other instance methods*
 468
 469    - `dic_inner_node`
 470    '''
 471    def __new__(cls, other, dataset=None):
 472        '''initialization of attributes from "other"'''
 473        if isinstance(other, AnaDfield):
 474            new = AnaDfield.__copy__(other)
 475            return new
 476        if isinstance(other, AnaField):
 477            new = AnaField.__copy__(other)
 478            new.__class__ = AnaDfield
 479            return new
 480        return object.__new__(cls)
 481
 482    def __init__(self, other, dataset):
 483        '''AnaDfield is created by adding a AnaDataset link to an AnaField object.
 484
 485         *Parameters*
 486
 487        - **other** : AnaField or AnaDfield to initialize attributes
 488        - **dataset** : AnaDataset which includes the AnaDfield
 489        '''
 490        self.dataset = dataset
 491
 492    def __copy__(self):
 493        ''' Copy all the data '''
 494        return self.__class__(AnaField(self), self.dataset)
 495
 496    def __lt__(self, other):
 497        ''' return a comparison between field index'''
 498        return self.index < other.index
 499
 500    @property
 501    def index(self):
 502        '''return the row of the field in the AnaDataset'''
 503        if self == self.dataset.root:
 504            return -1
 505        return self.dataset.fields.index(self)
 506
 507    @property
 508    def fields(self):
 509        '''return the list of the fields included in the AnaDataset'''
 510        return self.dataset.fields
 511
 512    @property
 513    def list_relations(self):
 514        '''return the list of the relations with the AnaDfield'''
 515        return list(self.dataset.relations[self].values())
 516
 517    @property
 518    def list_p_derived(self):
 519        '''return the list of the derived relations with the parents of AnaDfield'''
 520        return [rel for rel in self.list_relations if rel.typecoupl == DERIVED
 521                and not rel.parent_child]
 522
 523    @property
 524    def list_c_derived(self):
 525        '''return the list of the derived relations with the childs of AnaDfield'''
 526        return [rel for rel in self.list_relations if rel.typecoupl == DERIVED
 527                and rel.parent_child
 528                and rel.relation[1].category != UNIQUE]
 529
 530    @property
 531    def list_coupled(self):
 532        '''return the list of the coupled relations with the AnaDfield'''
 533        return [rel for rel in self.list_relations if rel.typecoupl == COUPLED]
 534
 535    @property
 536    def dist_root(self):
 537        '''return the distance to the root field'''
 538        return len(self.dataset) - self.lencodec
 539
 540    @property
 541    def category(self):
 542        '''return AnaDfield category (unique, rooted, coupled, derived, mixed)'''
 543        if self.typecodec == UNIQUE:
 544            return UNIQUE
 545        if self.typecodec in (COMPLETE, FULL):
 546            return ROOTED
 547        if COUPLED in [rel.typecoupl for rel in self.list_relations
 548                       if not rel.parent_child]:
 549            return COUPLED
 550        if not self.list_c_derived:
 551            return DERIVED
 552        return MIXED
 553
 554    @property
 555    def p_derived(self):
 556        '''return the first derived or coupled parent of the AnaDfield'''
 557        if self.category in (UNIQUE, ROOTED):
 558            return self.dataset.root
 559        if self.category == COUPLED:
 560            return [rel.relation[1] for rel in self.list_coupled
 561                    if not rel.relation[1].category == COUPLED][0]
 562        if not self.list_p_derived:
 563            return self.dataset.root
 564        distance_min = min(rel.distance for rel in self.list_p_derived)
 565        for rel in self.list_p_derived:
 566            if rel.distance == distance_min:
 567                if rel.relation[1].category == ROOTED:
 568                    return self.dataset.root
 569                if rel.relation[1].category == MIXED:
 570                    return rel.relation[1]
 571        return self.dataset.root
 572
 573    @property
 574    def p_distance(self):
 575        '''return the first parent with minimal distance of the AnaDfield'''
 576        return self._p_min_dist()
 577
 578    @property
 579    def p_distomin(self):
 580        '''return the first parent with minimal distomin of the AnaDfield'''
 581        return self._p_min_dist(False)
 582
 583    def _p_min_dist(self, distance=True):
 584        '''return the parent with minimal distance of the AnaDfield'''
 585        if self.category == UNIQUE:
 586            return self.dataset.root
 587        if distance:
 588            dist_up = [rel.distance for rel in self.list_relations if
 589                       not rel.parent_child]
 590        else:
 591            dist_up = [rel.distomin for rel in self.list_relations if
 592                       not rel.parent_child]
 593        if not dist_up or min(dist_up) == self.dist_root:
 594            return self.dataset.root
 595        dist_min = min(dist_up)
 596        if distance:
 597            list_dmin = [rel.relation[1] for rel in self.list_relations
 598                         if rel.distance == dist_min]
 599        else:
 600            list_dmin = [rel.relation[1] for rel in self.list_relations
 601                         if rel.distomin == dist_min]
 602        max_lencodec = max(fld.lencodec for fld in list_dmin)
 603        return [fld for fld in list_dmin if fld.lencodec == max_lencodec][0]
 604
 605    def to_dict(self, mode='id'):
 606        '''return a dict with field attributes.
 607
 608         *Parameters*
 609
 610        - **mode** : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
 611        '''
 612        dic = super().to_dict(full=True, idfield=False, notnone=False)
 613        dic[DISTROOT] = self.dist_root
 614        dic[NUM] = self.index
 615        dic[CATEGORY] = self.category
 616        dic[PDISTANCE] = self.p_distance.view(mode)
 617        dic[PDISTOMIN] = self.p_distomin.view(mode)
 618        dic[PDERIVED] = self.p_derived.view(mode)
 619        return dic
 620
 621    def view(self, mode='field'):
 622        ''' return a representation of the AnaDfield
 623
 624         *Parameters*
 625
 626        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
 627        '''
 628        return Util.view(self, mode)
 629
 630    def ascendants(self, typeparent='derived', mode='field'):
 631        ''' return the list of the AnaDfield's ascendants in the family tree up to
 632        the root AnaDfield.
 633
 634         *Parameters*
 635
 636        - **typeparent** : str (default 'derived') - 'derived', 'distance' or 'distomin'
 637        - **mode** : str (default 'field') - AnaDfield representation
 638        ('field', 'id', 'index')
 639
 640        *Returns* : list of parents from closest to the most distant. Parents
 641        are represented with index, idfield, or object
 642        '''
 643        parent = self
 644        listparent = []
 645        while parent != self.dataset.root:
 646            if typeparent == 'derived':
 647                parent = parent.p_derived
 648            elif typeparent == 'distance':
 649                parent = parent.p_distance
 650            else:
 651                parent = parent.p_distomin
 652            if parent != self.dataset.root:
 653                listparent.append(parent)
 654        return Util.view(listparent, mode)
 655
 656    def dic_inner_node(self, mode, lname):
 657        '''return a child AnaDfield tree.
 658
 659         *Parameters*
 660
 661        - **lname** : integer - maximal length of the names
 662        - **mode** : string (default 'derived') - kind of tree :
 663            'derived' : derived tree
 664            'distance': min distance tree
 665            'distomin': min distomin tree
 666
 667        *Returns* : dict where key is a AnaDfield and value is the list of
 668        the childs "name ( dist - lencodec)".
 669        '''
 670        adding = ''
 671        if mode == 'distance':
 672            rel_parent = self.dataset.get_relation(self, self.p_distance)
 673            adding = str(rel_parent.distance) + ' - '
 674        elif mode == 'distomin':
 675            rel_parent = self.dataset.get_relation(self, self.p_distomin)
 676            adding = str(rel_parent.distomin) + ' - '
 677        elif mode == 'derived':
 678            rel_parent = self.dataset.get_relation(self, self.p_derived)
 679            adding = str(rel_parent.distance) + ' - '
 680        adding += str(self.lencodec)
 681        name = str(self.idfield)[:lname] + ' (' + adding + ')'
 682        lis = [name.replace(' ', '*').replace("'", '*')]
 683        if mode == 'derived':
 684            childs = []
 685            if not self.category in (ROOTED, COUPLED, UNIQUE):
 686                for rel in self.list_coupled:
 687                    lis.append(rel.relation[1].dic_inner_node(mode, lname))
 688            if not self.category in (ROOTED, UNIQUE):
 689                childs = [rel.relation[1] for rel in self.list_relations
 690                          if rel.relation[1].p_derived == self and
 691                          rel.relation[1].category != COUPLED]
 692        if mode == 'distomin':
 693            childs = [rel.relation[1] for rel in self.list_relations
 694                      if rel.relation[1].p_distomin == self]
 695        if mode == 'distance':
 696            childs = [rel.relation[1] for rel in self.list_relations
 697                      if rel.relation[1].p_distance == self]
 698        for fld in childs:
 699            lis.append(fld.dic_inner_node(mode, lname))
 700        return {str(self.index).ljust(2, '*'): lis}
 701
 702
 703class AnaDataset:
 704    '''This class analyses the structure of a dataset.
 705
 706    *Attributes* :
 707
 708    - **iddataset** : string or integer - Id of the Dataset
 709    - **fields** : list of the AnaDfields included
 710    - **relations** : dict of the AnaRelations between two AnaDfields
 711    - **hashd** : string - update identifier
 712
 713    *relationship (@property)*
 714
 715    - `ana_relations`
 716    - `p_relations`
 717
 718    *field (@property)*
 719
 720    - `root`
 721    - `primary`
 722    - `secondary`
 723    - `unique`
 724    - `mixte`
 725    - `variable`
 726
 727    *global (@property)*
 728
 729    - `category`
 730    - `complete`
 731    - `dimension`
 732
 733    *update (instance methods)*
 734
 735    - `set_relations`
 736
 737    *access (instance methods)*
 738
 739    - `get_relation`
 740    - `dfield`
 741
 742    *synthesis (instance methods)*
 743
 744    - `tree`
 745    - `to_dict`
 746    - `indicator`
 747    - `partitions`
 748    - `field_partition`
 749    - `relation_partition`
 750    '''
 751
 752    def __init__(self, fields=None, relations=None, iddataset=None,
 753                 leng=None, hashd=None):
 754        '''Creation mode :
 755        - single dict attribute where keys are attributes name,
 756        - single AnaDataset attribute to make a copy
 757        - multiple attributes
 758
 759        *Parameters (single dict)*
 760
 761        - **fields**: {'fields': list_of_dict, 'name': id_dataset,
 762                       'length': length, 'relations': dict_of_relations
 763            where:
 764                list_of_dict : {'id': id_field, 'lencodec': len_codec, 'mincodec': min_codec}
 765                id_field: string - name of field
 766                other_field: string - name of field
 767                len_codec: int - length of the codec
 768                min_codec: int - number of different codec values
 769                id_dataset : name of the dataset
 770                length: int - length of the dataset
 771                dict_of_relations: {id_field : {other_field: dist} for all fields}
 772                field: name of a field
 773                field_other: name of another field
 774                dist: integer (distance between the two fields) or
 775                array (distance and boolean distributed)
 776
 777        *Parameters (multiple attributes)*
 778
 779        - **fields**: list_of_dict
 780        - **iddataset** : string (default None) - id_dataset
 781        - **relations** : dict (default None) - dict_of_relations
 782        - **leng** : int (default None) - length
 783        - **hashd** : string (default None) - update identifier
 784        '''
 785        if isinstance(fields, AnaDataset):
 786            self.iddataset = fields.iddataset
 787            self.fields = fields.fields
 788            self.relations = fields.relations
 789            self.hashd = fields.hashd
 790            return
 791        if isinstance(fields, dict):
 792            iddataset = fields.get(IDDATASET, None)
 793            leng = fields.get(LENGTH, None)
 794            relations = fields.get(RELATIONS, None)
 795            hashd = fields.get(HASHD)
 796            fields = fields.get(FIELDS, None)
 797        self.iddataset = iddataset
 798        self.fields = [AnaDfield(AnaField(field), self)
 799                       for field in fields] if fields else []
 800        if leng:
 801            for fld in self.fields:
 802                fld.maxcodec = leng
 803        self.relations = {field: {} for field in self.fields}
 804        if relations:
 805            for fld, dic_relation in relations.items():
 806                self.set_relations(fld, dic_relation)
 807        self.hashd = hashd
 808
 809    def __len__(self):
 810        '''length of the AnaDataset (len of the AnaDfields included)'''
 811        return max(len(fld) for fld in self.fields)
 812
 813    def __eq__(self, other):
 814        ''' equal if class and values are equal'''
 815        return self.__class__ .__name__ == other.__class__.__name__ and \
 816            self.fields == other.fields and self.relations == other.relations and \
 817            self.iddataset == other.iddataset and self.hashd == other.hashd
 818
 819    def __hash__(self):
 820        '''return hash value (sum of attributes hash)'''
 821        return hash(self.iddataset) + sum(hash(fld) for fld in self.fields) + \
 822            sum(hash(rel) for rel in self.relations) + hash(self.hashd)
 823
 824    @property
 825    def category(self):
 826        '''return a list of AnaDfield category (unique, rooted, coupled, derived, mixed)'''
 827        return [fld.category for fld in self.fields]
 828
 829    @property
 830    def ana_relations(self):
 831        '''return the list of AnaRelation included'''
 832        return [rel for fldrel in self.relations.values() for rel in fldrel.values()]
 833
 834    @property
 835    def p_relations(self):
 836        '''return the list of oriented AnaRelation (parent first, child second)'''
 837        return [rel for rel in self.ana_relations if rel.parent_child]
 838
 839    @property
 840    def root(self):
 841        '''return the root AnaDfield'''
 842        len_self = len(self)
 843        return AnaDfield(AnaField(ROOT, len_self, len_self, len_self), self)
 844
 845    @property
 846    def primary(self):
 847        '''return the first partition of the partitions'''
 848        return self.field_partition(mode='field')['primary']
 849        # part = self.partitions(mode='field', distributed=True)
 850        # return part[0] if part else []
 851
 852    @property
 853    def complete(self):
 854        '''return True if the dimension is not 0'''
 855        return self.dimension > 0
 856
 857    @property
 858    def dimension(self):
 859        '''return the highest partition lenght'''
 860        return len(self.primary)
 861
 862    @property
 863    def secondary(self):
 864        '''return the derived ou coupled fields from primary'''
 865        return self.field_partition(mode='field')['secondary']
 866
 867    @property
 868    def unique(self):
 869        '''return the unique fields'''
 870        return [fld for fld in self.fields if fld.category == UNIQUE]
 871
 872    @property
 873    def variable(self):
 874        '''return the variable fields'''
 875        return self.field_partition(mode='field')['variable']
 876
 877    @property
 878    def mixte(self):
 879        '''return the variable fields'''
 880        return self.field_partition(mode='field')['mixte']
 881
 882    def set_relations(self, field, dic_relations):
 883        '''Add relations in the AnaDataset from a dict.
 884
 885         *Parameters*
 886
 887        - **field** : AnaDfield, AnaField or str (idfield) - first relation AnaDfield
 888        - **dic_relations** : dict - key is the second relation AnaDfield and
 889        value is the dist value or teh list [dist, distrib]
 890        '''
 891        fld = self.dfield(field)
 892        for other, dist in dic_relations.items():
 893            oth = self.dfield(other)
 894            self.relations[fld][oth] = AnaRelation([fld, oth], dist)
 895            self.relations[oth][fld] = AnaRelation([oth, fld], dist)
 896
 897    def get_relation(self, fld1, fld2):
 898        '''Return AnaRelation between fld1 and fld2.
 899
 900         *Parameters*
 901
 902        - **fld1** : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield
 903        - **fld2** : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield
 904        '''
 905        fl1 = self.dfield(fld1)
 906        fl2 = self.dfield(fld2)
 907        if self.root in [fl1, fl2]:
 908            return AnaRelation([fl1, fl2], len(self))
 909        return self.relations[self.dfield(fld1)][self.dfield(fld2)]
 910
 911    def dfield(self, fld):
 912        '''return the AnaDfield matching with fld. Fld is str, int, AnaDfield or AnaField'''
 913        if fld in (-1, ROOT):
 914            return self.root
 915        if isinstance(fld, AnaDfield):
 916            return fld
 917        if isinstance(fld, int):
 918            return self.fields[fld]
 919        if isinstance(fld, str):
 920            if fld in [dfld.idfield for dfld in self.fields]:
 921                return [dfld for dfld in self.fields if dfld.idfield == fld][0]
 922            return None
 923        return AnaDfield(fld, self)
 924
 925    def tree(self, mode='derived', width=5, lname=20, string=True):
 926        '''return a string with a tree of derived Field.
 927
 928         *Parameters*
 929
 930        - **lname** : integer (default 20) - length of the names
 931        - **width** : integer (default 5) - length of the lines
 932        - **string** : boolean (default True) - if True return str else return dict
 933        - **mode** : string (default 'derived') - kind of tree :
 934            'derived' : derived tree
 935            'distance': min distance tree
 936            'distomin': min distomin tree
 937        '''
 938        lis = ['root-' + mode + '*(' + str(len(self)) + ')']
 939        if mode == 'distance':
 940            childs = [fld for fld in self.fields if fld.p_distance == self.root]
 941        elif mode == 'distomin':
 942            childs = [fld for fld in self.fields if fld.p_distomin == self.root]
 943        elif mode == 'derived':
 944            childs = [fld for fld in self.fields if fld.p_derived == self.root]
 945        for fld in childs:
 946            lis.append(fld.dic_inner_node(mode, lname))
 947        tree = {str(-1).ljust(2, '*'): lis}
 948        if string:
 949            tre = pprint.pformat(tree, indent=0, width=width)
 950            tre = tre.replace('---', ' - ')
 951            tre = tre.replace('  ', ' ')
 952            tre = tre.replace('*', ' ')
 953            for car in ["'", "\"", "{", "[", "]", "}", ","]:
 954                tre = tre.replace(car, "")
 955            return tre
 956        return Util.clean_dic(tree, '*', ' ')
 957
 958    def to_dict(self, mode='field', keys=None, relations=False):
 959        '''return a dict with fields attributes and optionaly relations attributes.
 960
 961         *Parameters*
 962
 963        - **mode** : str (default 'field') - AnaDfield representation
 964        ('field', 'id', 'index')
 965        - **relations** : boolean (default: False) - if False return a list of fields,
 966        if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}'
 967        - **keys** : string, list or tuple - list of keys or single key to return
 968        if 'all' or None, all keys are returned
 969        if list, only keys in list are returned
 970        if string, only values associated to the string(key) are returned'''
 971        fields = Util.filter_dic([fld.to_dict(mode=mode)
 972                                 for fld in self.fields], keys)
 973        leng = len(self.fields)
 974        if not relations:
 975            return fields
 976        return {'fields': fields, 'relations':
 977                [self.get_relation(i, j).to_dict(full=True, mode=mode)
 978                 for i in range(-1, leng) for j in range(i + 1, leng)]}
 979
 980    def partitions(self, mode='id', distributed=True):
 981        '''return a list of available partitions (the first is highest).
 982
 983         *Parameters*
 984
 985        - **mode** : str (default 'id') - AnaDfield representation
 986        ('field', 'id', 'index')
 987        - **distributed** : boolean (default True) - Include only distributed fields
 988        '''
 989        partit = [[fld] for fld in self.fields if fld.category == ROOTED]
 990        crossed = [rel for rel in self.ana_relations if rel.typecoupl == CROSSED
 991                   and rel.parent_child
 992                   and rel.relation[0].category != COUPLED
 993                   and rel.relation[1].category != COUPLED]
 994        if distributed:
 995            crossed = [rel for rel in crossed if rel.distrib]
 996        if crossed and len(crossed) == 1 and crossed[0].dist == len(self):
 997            partit.insert(0, crossed[0].relation)
 998        elif crossed:
 999            for repeat in list(range(len(crossed))):
1000                candidates = combinations(crossed, repeat + 1)
1001                for candidat in candidates:
1002                    flds = list(set(rel.relation[i]
1003                                for rel in candidat for i in [0, 1]))
1004                    if (reduce(mul, [fld.lencodec for fld in flds]) == len(self) and
1005                        len(candidat) == sum(range(len(flds))) and
1006                            (not distributed or min(rel.distrib for rel in candidat))):
1007                        partit.insert(0, flds)
1008        partit = [list(tup) for tup in
1009                  sorted(sorted(list({tuple(sorted(prt)) for prt in partit})),
1010                         key=len, reverse=True)]
1011        return Util.view(partit, mode)
1012
1013    def field_partition(self, mode='id', partition=None, distributed=True):
1014        '''return a partition dict with the list of primary, secondary, unique
1015        and variable fields.
1016
1017        *Parameters*
1018
1019        - **mode** : str (default 'id') - AnaDfield representation
1020        ('field', 'id', 'index')
1021        - **partition** : list of str, int, AnaDfield or AnaField(default None) -
1022        if None, partition is the first
1023        - **distributed** : boolean (default True) - Include only distributed fields
1024        '''
1025        partitions = self.partitions(mode='field', distributed=distributed)
1026        if not partitions:
1027            return Util.view(
1028                {'primary': [], 'secondary': [
1029                    fld for fld in self.fields if fld.category != UNIQUE],
1030                 'mixte': [], 'unique': [
1031                    fld for fld in self.fields if fld.category == UNIQUE],
1032                 'variable': []}, mode)
1033        if not partition:
1034            partition = partitions[0]
1035        else:
1036            # partition = [self.dfield(fld) for fld in tuple(sorted(partition))]
1037            partition = [self.dfield(fld) for fld in tuple(partition)]
1038        secondary = []
1039        for field in partition:
1040            self._add_child(field, secondary)
1041        secondary = [fld for fld in secondary if not fld in partition]
1042        unique = [fld for fld in self.fields if fld.category == UNIQUE]
1043        mixte = list(self._mixte_dims(partition, partitions))
1044        variable = [fld for fld in self.fields
1045                    if not fld in partition + secondary + unique + mixte]
1046        return Util.view({'primary': partition, 'secondary': secondary,
1047                          'mixte': mixte, 'unique': unique,
1048                          'variable': variable}, mode)
1049
1050    def relation_partition(self, partition=None, primary=False, noroot=False):
1051        '''return a dict with the list of relationships for fields in a partition.
1052
1053        *Parameters*
1054
1055        - **partition** : list (default None) - if None, partition is the first
1056        - **primary** : boolean (default False) - if True, relations are primary fields
1057        - **noroot** : boolean (default False) - if True and single primary,
1058        'root' field is replaced by the primary field'''
1059        partitions = self.partitions(mode='field')
1060        if not partitions:
1061            partition = None
1062        else:
1063            partition = Util.view(partition, mode='field',
1064                                  ana=self) if partition else partitions[0]
1065        part = self.field_partition(
1066            mode='field', partition=partition, distributed=True)
1067        fields_cat = {fld: cat for cat, l_fld in part.items() for fld in l_fld}
1068        relations = {}
1069        for field in fields_cat:
1070            rel = []
1071            match fields_cat[field]:
1072                case 'primary':
1073                    rel = [field.idfield]
1074                case 'unique': ...
1075                case 'variable':
1076                    rel = [fld.idfield for fld in part['primary']]
1077                case 'secondary' if not primary:
1078                    rel = [field.p_derived.idfield]
1079                case 'secondary' if primary:
1080                    rel = [fld.idfield for fld in field.ascendants()
1081                           if fld in part['primary']]
1082                case 'mixte':
1083                    rel = [fld.idfield for fld in self._mixte_dims(
1084                        partition, partitions)[field]]
1085                case _: ...
1086            if rel == ['root'] and len(part['primary']) == 1 and noroot:
1087                rel = [part['primary'][0].idfield]
1088            if rel == ['root'] and len(part['primary']) == 0 and noroot:
1089                rel = [part['secondary'][0].idfield]
1090            relations[field.idfield] = rel
1091        return relations
1092
1093    def indicator(self, fullsize, size):
1094        '''generate size indicators: ol (object lightness), ul (unicity level),
1095        gain (sizegain)
1096
1097        *Parameters*
1098
1099        - **fullsize** : int - size with full codec
1100        - **size** : int - size with existing codec
1101
1102        *Returns* : dict'''
1103        lenindex = len(self.fields)
1104        indexlen = sum(fld.lencodec for fld in self.fields)
1105        nval = len(self) * (lenindex + 1)
1106        sval = fullsize / nval
1107        ncod = indexlen + lenindex
1108
1109        if nval != ncod:
1110            scod = (size - ncod * sval) / (nval - ncod)
1111            olight = scod / sval
1112        else:
1113            olight = None
1114        return {'total values': nval, 'mean size': round(sval, 3),
1115                'unique values': ncod, 'mean coding size': round(scod, 3),
1116                'unicity level': round(ncod / nval, 3),
1117                'optimize level': round(size / fullsize, 3),
1118                'object lightness': round(olight, 3),
1119                'maxgain': round((nval - ncod) / nval, 3),
1120                'gain': round((fullsize - size) / fullsize, 3)}
1121
1122    def _add_child(self, field, childs):
1123        ''' add derived or coupled fields in the childs list'''
1124        for rel in field.list_c_derived + field.list_coupled:
1125            child = rel.relation[1]
1126            if not child in childs and not child.category == UNIQUE:
1127                childs.append(child)
1128                if not child.category in (COUPLED, UNIQUE):
1129                    self._add_child(child, childs)
1130
1131    def _mixte_dims(self, partition, partitions):
1132        '''return dict with dimensions associated to each mixte field'''
1133        dic_mixte = {}
1134        for part in partitions:
1135            not_part = [fld for fld in part if not fld in partition]
1136            if len(not_part) == 1 and len(partition) > len(part) > 1:
1137                sub_part = [fld for fld in partition if not fld in part]
1138                if min(self.get_relation(not_part[0], fld).typecoupl == 'derived'
1139                       for fld in sub_part) is True:
1140                    dic_mixte[not_part[0]] = sub_part
1141        return dic_mixte
1142
1143
class Util:
    ''' common functions for analysis package'''

    @staticmethod
    def view(field_struc, mode, ana=None):
        ''' return a representation of a AnaDfields structure (field, id, index).

         *Parameters*

        - **mode** : str - AnaDfield representation ('field', 'id', 'index')
        - **field_struc** : list or dict - structure to represent
        - **ana** : AnaDataset (default None) - to convert string or index in AnaDfield

        The structure is returned unchanged if mode is None or if the structure
        is empty. `ana` is used (and required) when an element is not an
        AnaDfield and mode is not 'id'.
        '''
        # an integer is an index: 0 is a valid value, not an empty structure
        is_index = isinstance(field_struc, int) and not isinstance(field_struc, bool)
        if mode is None or (not field_struc and not is_index):
            return field_struc
        if isinstance(field_struc, dict):
            return {key: Util.view(val, mode=mode, ana=ana)
                    for key, val in field_struc.items()}
        if isinstance(field_struc, list):
            return [Util.view(val, mode=mode, ana=ana) for val in field_struc]
        if not isinstance(field_struc, AnaDfield) and mode != 'id':
            return Util.view(ana.dfield(field_struc), mode=mode)
        if mode == 'field':
            return field_struc
        if mode == 'index':
            return field_struc.index
        return field_struc.idfield

    @staticmethod
    def reduce_dic(obj, notempty=False):
        '''return a dict without None values

         *Parameters*

        - **obj** : dict, list or other - if dict, None values are removed
        (also in the included dict), if list, each element is processed,
        otherwise obj is returned unchanged
        - **notempty** : boolean (default False) - if True, the values that are
        empty or false (tested before the processing of the value) are also
        removed'''
        if isinstance(obj, dict):
            # notempty is forwarded to apply the same filter to the nested dict
            return {key: Util.reduce_dic(val, notempty)
                    for key, val in obj.items()
                    if val is not None and (not notempty or val)}
        if isinstance(obj, list):
            return [Util.reduce_dic(val, notempty) for val in obj]
        return obj

    @staticmethod
    def clean_dic(obj, old, new):
        '''return a dict or list with updated strings by replacing "old" substring
        with "new" substring (keys and values of dict, elements of list)'''
        if isinstance(obj, dict):
            return {Util.clean_dic(key, old, new): Util.clean_dic(val, old, new)
                    for key, val in obj.items()}
        if isinstance(obj, str):
            return obj.replace(old, new)
        if isinstance(obj, list):
            return [Util.clean_dic(val, old, new) for val in obj]
        return obj

    @staticmethod
    def filter_dic(obj, keys):
        '''return extract of a list of dict or of a dict

         *Parameters*

        - **keys** : string, list or tuple - list of keys or single key to return
        if 'all' or None, all keys are returned
        if list, only keys in list are returned
        if string, only values associated to the string(key) are returned
        (None if the key is not present)'''
        if not keys or keys == 'all':
            return obj
        if isinstance(obj, list):
            return [Util.filter_dic(dic, keys) for dic in obj]
        if isinstance(keys, str) and isinstance(obj, dict):
            return obj.get(keys, None)
        if isinstance(keys, (list, tuple)) and isinstance(obj, dict):
            return {key: val for key, val in obj.items() if key in keys}
        return obj
1212
1213
class AnaError(Exception):
    '''Exception raised for the errors detected by the analysis classes
    (e.g. an incorrect `lencodec` value in the `AnaField` constructor).'''
# values returned by AnaField.typecodec
NULL = 'null'
UNIQUE = 'unique'
COMPLETE = 'complete'
FULL = 'full'
DEFAULT = 'default'
MIXED = 'mixed'
# kinds of relationship or of field (CROSSED is compared with `typecoupl` values;
# COUPLED, ROOTED and UNIQUE are compared with `category` values)
COUPLED = 'coupled'
DERIVED = 'derived'
LINKED = 'linked'
CROSSED = 'crossed'
DISTRIBUTED = 'distributed'
ROOTED = 'rooted'
# idfield of the root field
ROOT = 'root'
# keys of the dict representation of an AnaField
IDFIELD = 'id'
MINCODEC = 'mincodec'
MAXCODEC = 'maxcodec'
LENCODEC = 'lencodec'
RATECODEC = 'ratecodec'
DMINCODEC = 'dmincodec'
DMAXCODEC = 'dmaxcodec'
RANCODEC = 'rancodec'
TYPECODEC = 'typecodec'
HASHF = 'hashf'
# presumably keys of the dict representations of the relations, of the fields
# of a dataset and of the dataset itself - TODO confirm against their usage
RELATION = 'relation'
HASHR = 'hashr'
DIST = 'dist'
DMAX = 'dmax'
DMIN = 'dmin'
DIFF = 'diff'
DRAN = 'dran'
NUM = 'num'
CATEGORY = 'category'
PDERIVED = 'pderived'
PDISTANCE = 'pdistance'
PDISTOMIN = 'pdistomin'
DISDISTANCE = 'disdistance'
DERDISTANCE = 'derdistance'
DISRATECPL = 'disratecpl'
DERRATECPL = 'derratecpl'
DISRATEDER = 'disrateder'
DERRATEDER = 'derrateder'
TYPECOUPL = 'typecoupl'
PARENTCHILD = 'parentchild'
DISTANCE = 'distance'
DISTOMIN = 'distomin'
DISTOMAX = 'distomax'
DISTROOT = 'distroot'
RATECPL = 'ratecpl'
RATEDER = 'rateder'
IDDATASET = 'name'
RELATIONS = 'relations'
FIELDS = 'fields'
LENGTH = 'length'
HASHD = 'hashd'
class AnaField:
 79class AnaField:
 80    '''This class analyses field entities.
 81
 82    *Attributes*
 83
 84    - **idfield** : string - name or Id of the field
 85    - **lencodec**: integer - codec length
 86    - **mincodec**: integer - minimal codec length
 87    - **maxcodec**: integer - minimal codec length
 88    - **hashf**: integer - hash value to identify modifications
 89
 90    *characteristic (@property)*
 91
 92    - `iscomplete`
 93    - `ratecodec`
 94    - `dmincodec`
 95    - `dmaxcodec`
 96    - `rancodec`
 97    - `typecodec`
 98
 99    *instance methods*
100
101    - `to_dict`
102
103    '''
104
105    def __init__(self, idfield, lencodec=None, mincodec=None, maxcodec=None, hashf=None):
106        '''Creation mode :
107        - single dict attribute where keys are attributes name,
108        - single AnaField attribute to make a copy
109        - multiple attributes
110
111        *Parameters (multiple attributes)*
112
113        - **idfield** : string or integer - Id of the Field
114        - **lencodec** : integer (default None) - length of the codec
115        - **mincodec** : integer (default None) - number of different values
116        - **maxcodec** : integer (default None) - length of the field
117        - **hashf** : string (default None) - update identifier
118
119        *example*
120
121        AnaField is created with a dict
122        >>> AnaField(Cfield([1,2,3,3]).to_analysis).to_dict()
123        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
124        >>> AnaField({'lencodec': 4, 'mincodec': 3, 'maxcodec': 4})
125        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
126
127        AnaField is created with parameters
128        >>> AnaField(lencodec=4, mincodec=3, maxcodec=4).to_dict()
129        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
130        >>> AnaField(4, 3, 4).to_dict()
131        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
132        '''
133        if isinstance(idfield, dict):
134            self.idfield = idfield.get(IDFIELD, None)
135            self.lencodec = idfield.get(LENCODEC, None)
136            self.mincodec = idfield.get(MINCODEC, None)
137            self.maxcodec = idfield.get(MAXCODEC, None)
138            self.hashf = idfield.get(HASHF, None)
139            return
140        if isinstance(idfield, (AnaField, AnaDfield)):
141            self.idfield = idfield.idfield
142            self.lencodec = idfield.lencodec
143            self.mincodec = idfield.mincodec
144            self.maxcodec = idfield.maxcodec
145            self.hashf = idfield.hashf
146            return
147        if not lencodec or not isinstance(lencodec, int):
148            raise AnaError("lencodec is not correct")
149        self.idfield = idfield
150        self.lencodec = lencodec
151        self.mincodec = mincodec
152        self.maxcodec = maxcodec
153        self.hashf = hashf
154
155    def __len__(self):
156        '''length of the field (maxcodec)'''
157        return self.maxcodec if self.maxcodec else self.lencodec
158
159    def __repr__(self):
160        '''representation of the field (class name + idfield)'''
161        return self.__class__.__name__ + '(' + str(self.idfield) + ')'
162
163    def __eq__(self, other):
164        ''' equal if class and attributes are equal'''
165        return self.__class__ .__name__ == other.__class__.__name__ and \
166            self.idfield == other.idfield and self.lencodec == other.lencodec and \
167            self.mincodec == other.mincodec and self.maxcodec == other.maxcodec and \
168            self.hashf == other.hashf
169
170    def __lt__(self, other):
171        ''' return a comparison between hash value'''
172        return hash(self) < hash(other)
173
174    def __hash__(self):
175        '''return hash value (sum of attributes hash)'''
176        return hash(self.idfield) + hash(self.lencodec) + hash(self.mincodec) \
177            + hash(self.maxcodec) + hash(self.hashf)
178
179    def __str__(self):
180        '''json-text build with the attributes dict'''
181        return json.dumps(self.to_dict(idfield=True))
182
183    def __copy__(self):
184        ''' Copy all the attributes '''
185        return self.__class__(self)
186
187    def to_dict(self, full=False, idfield=False, notnone=True):
188        '''return a dict with field attributes.
189
190         *Parameters*
191
192        - **full** : boolean (default False) - if True, all the attributes are included
193        - **idfield** : boolean (default False) - if True, idfield is included
194        - **notnone** : boolean (default True) - if True, None values are not included
195        '''
196        dic = {LENCODEC: self.lencodec, MINCODEC: self.mincodec,
197               MAXCODEC: self.maxcodec}
198        if idfield or full:
199            dic[IDFIELD] = self.idfield
200        if full:
201            dic |= {RATECODEC: self.ratecodec, DMINCODEC: self.dmincodec,
202                    DMAXCODEC: self.dmaxcodec, RANCODEC: self.rancodec,
203                    TYPECODEC: self.typecodec}
204        if notnone:
205            return Util.reduce_dic(dic)
206        return dic
207
208    @property
209    def iscomplete(self):
210        '''return boolean indicator : True if all attributes are present'''
211        return not self.maxcodec is None and not self.mincodec is None
212
213    @property
214    def ratecodec(self):
215        '''return float ratecodec indicator'''
216        if self.iscomplete and self.maxcodec - self.mincodec:
217            return (self.maxcodec - self.lencodec) / (self.maxcodec - self.mincodec)
218        return None
219
220    @property
221    def dmincodec(self):
222        '''return integer dmincodec indicator'''
223        return self.lencodec - self.mincodec if self.iscomplete else None
224
225    @property
226    def dmaxcodec(self):
227        '''return integer dmaxcodec indicator'''
228        return self.maxcodec - self.lencodec if self.iscomplete else None
229
230    @property
231    def rancodec(self):
232        '''return integer rancodec indicator'''
233        return self.maxcodec - self.mincodec if self.iscomplete else None
234
235    @property
236    def typecodec(self):
237        '''return string typecodec indicator
238        (null, unique, complete, full, default, mixed)
239        '''
240        if self.maxcodec is None or self.mincodec is None:
241            return None
242        if self.maxcodec == 0:
243            return NULL
244        if self.lencodec == 1:
245            return UNIQUE
246        if self.mincodec == self.maxcodec:
247            return COMPLETE
248        if self.lencodec == self.maxcodec:
249            return FULL
250        if self.lencodec == self.mincodec:
251            return DEFAULT
252        return MIXED

This class analyses field entities.

Attributes

  • idfield : string - name or Id of the field
  • lencodec: integer - codec length
  • mincodec: integer - minimal codec length
  • maxcodec: integer - maximal codec length
  • hashf: integer - hash value to identify modifications

characteristic (@property)

instance methods

AnaField(idfield, lencodec=None, mincodec=None, maxcodec=None, hashf=None)
105    def __init__(self, idfield, lencodec=None, mincodec=None, maxcodec=None, hashf=None):
106        '''Creation mode :
107        - single dict attribute where keys are attributes name,
108        - single AnaField attribute to make a copy
109        - multiple attributes
110
111        *Parameters (multiple attributes)*
112
113        - **idfield** : string or integer - Id of the Field
114        - **lencodec** : integer (default None) - length of the codec
115        - **mincodec** : integer (default None) - number of different values
116        - **maxcodec** : integer (default None) - length of the field
117        - **hashf** : string (default None) - update identifier
118
119        *example*
120
121        AnaField is created with a dict
122        >>> AnaField(Cfield([1,2,3,3]).to_analysis).to_dict()
123        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
124        >>> AnaField({'lencodec': 4, 'mincodec': 3, 'maxcodec': 4})
125        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
126
127        AnaField is created with parameters
128        >>> AnaField(lencodec=4, mincodec=3, maxcodec=4).to_dict()
129        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
130        >>> AnaField(4, 3, 4).to_dict()
131        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
132        '''
133        if isinstance(idfield, dict):
134            self.idfield = idfield.get(IDFIELD, None)
135            self.lencodec = idfield.get(LENCODEC, None)
136            self.mincodec = idfield.get(MINCODEC, None)
137            self.maxcodec = idfield.get(MAXCODEC, None)
138            self.hashf = idfield.get(HASHF, None)
139            return
140        if isinstance(idfield, (AnaField, AnaDfield)):
141            self.idfield = idfield.idfield
142            self.lencodec = idfield.lencodec
143            self.mincodec = idfield.mincodec
144            self.maxcodec = idfield.maxcodec
145            self.hashf = idfield.hashf
146            return
147        if not lencodec or not isinstance(lencodec, int):
148            raise AnaError("lencodec is not correct")
149        self.idfield = idfield
150        self.lencodec = lencodec
151        self.mincodec = mincodec
152        self.maxcodec = maxcodec
153        self.hashf = hashf

Creation mode :

  • a single dict argument whose keys are attribute names,
  • a single AnaField argument to make a copy,
  • multiple attributes

Parameters (multiple attributes)

  • idfield : string or integer - Id of the Field
  • lencodec : integer (default None) - length of the codec
  • mincodec : integer (default None) - number of different values
  • maxcodec : integer (default None) - length of the field
  • hashf : string (default None) - update identifier

example

AnaField is created with a dict

>>> AnaField(Cfield([1,2,3,3]).to_analysis).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
>>> AnaField({'lencodec': 4, 'mincodec': 3, 'maxcodec': 4})
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}

AnaField is created with parameters

>>> AnaField(lencodec=4, mincodec=3, maxcodec=4).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
>>> AnaField(4, 3, 4).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
idfield
lencodec
mincodec
maxcodec
hashf
def to_dict(self, full=False, idfield=False, notnone=True):
187    def to_dict(self, full=False, idfield=False, notnone=True):
188        '''return a dict with field attributes.
189
190         *Parameters*
191
192        - **full** : boolean (default False) - if True, all the attributes are included
193        - **idfield** : boolean (default False) - if True, idfield is included
194        - **notnone** : boolean (default True) - if True, None values are not included
195        '''
196        dic = {LENCODEC: self.lencodec, MINCODEC: self.mincodec,
197               MAXCODEC: self.maxcodec}
198        if idfield or full:
199            dic[IDFIELD] = self.idfield
200        if full:
201            dic |= {RATECODEC: self.ratecodec, DMINCODEC: self.dmincodec,
202                    DMAXCODEC: self.dmaxcodec, RANCODEC: self.rancodec,
203                    TYPECODEC: self.typecodec}
204        if notnone:
205            return Util.reduce_dic(dic)
206        return dic

return a dict with field attributes.

Parameters

  • full : boolean (default False) - if True, all the attributes are included
  • idfield : boolean (default False) - if True, idfield is included
  • notnone : boolean (default True) - if True, None values are not included
iscomplete
208    @property
209    def iscomplete(self):
210        '''return boolean indicator : True if all attributes are present'''
211        return not self.maxcodec is None and not self.mincodec is None

return boolean indicator : True if all attributes are present

ratecodec
213    @property
214    def ratecodec(self):
215        '''return float ratecodec indicator'''
216        if self.iscomplete and self.maxcodec - self.mincodec:
217            return (self.maxcodec - self.lencodec) / (self.maxcodec - self.mincodec)
218        return None

return float ratecodec indicator

dmincodec
220    @property
221    def dmincodec(self):
222        '''return integer dmincodec indicator'''
223        return self.lencodec - self.mincodec if self.iscomplete else None

return integer dmincodec indicator

dmaxcodec
225    @property
226    def dmaxcodec(self):
227        '''return integer dmaxcodec indicator'''
228        return self.maxcodec - self.lencodec if self.iscomplete else None

return integer dmaxcodec indicator

rancodec
230    @property
231    def rancodec(self):
232        '''return integer rancodec indicator'''
233        return self.maxcodec - self.mincodec if self.iscomplete else None

return integer rancodec indicator

typecodec
235    @property
236    def typecodec(self):
237        '''return string typecodec indicator
238        (null, unique, complete, full, default, mixed)
239        '''
240        if self.maxcodec is None or self.mincodec is None:
241            return None
242        if self.maxcodec == 0:
243            return NULL
244        if self.lencodec == 1:
245            return UNIQUE
246        if self.mincodec == self.maxcodec:
247            return COMPLETE
248        if self.lencodec == self.maxcodec:
249            return FULL
250        if self.lencodec == self.mincodec:
251            return DEFAULT
252        return MIXED

return string typecodec indicator (null, unique, complete, full, default, mixed)

class AnaRelation:
class AnaRelation:
    '''This class analyses the relationship between two fields

    *Attributes* :

    - **relation** : List of the two fields involved in the relationship
    - **dist** : value of the relationship
    - **distrib** : boolean True if values are distributed (None if not given)
    - **hashr**: integer - hash value to identify update

    *global (@property)*

    - `id_relation`
    - `index_relation`
    - `parent_child`
    - `typecoupl`

    *characteristic (@property)*

    - `dmax`
    - `dmin`
    - `diff`
    - `dran`
    - `distomin`
    - `distomax`
    - `distance`
    - `ratecpl`
    - `rateder`

    *instance methods*

    - `to_dict`
    '''

    def __init__(self, relation, dists, hashr=None):
        '''Constructor of the relationship :

         *Parameters*

        - **relation** : List of the two fields involved in the relationship
        - **dists** : dist value, or list [dist value, distrib boolean]
        - **hashr**: integer - hash value to identify update
        '''
        self.relation = relation
        # only a list carries the distrib flag; any other value is the dist
        has_distrib = isinstance(dists, list)
        self.dist = dists[0] if has_distrib else dists
        self.distrib = dists[1] if has_distrib else None
        self.hashr = hashr

    def __repr__(self):
        '''representation of the relation (class name + list of idfield)'''
        return f'{self.__class__.__name__}({self.id_relation})'

    def __str__(self):
        '''json-text build with the attributes dict'''
        attributes = self.to_dict(relation=True)
        return json.dumps(attributes)

    def __eq__(self, other):
        ''' equal if class name and attributes (relation, dist, hashr,
        distrib) are equal'''
        if self.__class__.__name__ != other.__class__.__name__:
            return False
        return (self.relation == other.relation and
                self.dist == other.dist and
                self.hashr == other.hashr and
                self.distrib == other.distrib)

    def __hash__(self):
        '''return hash value (sum of the hash of the two fields, dist, hashr
        and distrib)'''
        hashed = (self.relation[0], self.relation[1],
                  self.dist, self.hashr, self.distrib)
        return sum(hash(item) for item in hashed)

    def to_dict(self, distances=False, full=False, mode='field', relation=False,
                notnone=True, misc=False):
        '''return a dict with AnaRelation attributes.

         *Parameters*

        - **distances** : boolean (default False) - if True, distances indicators are included
        - **full** : boolean (default False) - if True, all the attributes are included
        - **relation** : boolean (default False) - if True, the fields and the
        parent_child indicator are included
        - **notnone** : boolean (default True) - if True, the result is passed
        to Util.reduce_dic
        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
        - **misc** : boolean (default False) - if True, dmax, dmin, diff and
        dran are included
        '''
        dic = {DIST: self.dist, TYPECOUPL: self.typecoupl, HASHR: self.hashr}
        if relation or full:
            dic.update({RELATION: Util.view(self.relation, mode),
                        PARENTCHILD: self.parent_child})
        if distances or full:
            dic.update({DISTANCE: self.distance, DISTOMIN: self.distomin,
                        DISTOMAX: self.distomax, DISTRIBUTED: self.distrib,
                        RATECPL: self.ratecpl, RATEDER: self.rateder})
        if misc or full:
            dic.update({DMAX: self.dmax, DMIN: self.dmin,
                        DIFF: self.diff, DRAN: self.dran})
        return Util.reduce_dic(dic) if notnone else dic

    @property
    def id_relation(self):
        '''return a list with the id of the two fields involved
        (empty list if relation is empty)'''
        return [fld.idfield for fld in self.relation] if self.relation else []

    @property
    def parent_child(self):
        '''returns the direction of the relationship (True if parent is first).

        The parent is the field with the largest lencodec; when both lencodec
        are equal, the field with the lowest index.'''
        first, second = self.relation[0], self.relation[1]
        if first.lencodec != second.lencodec:
            return first.lencodec > second.lencodec
        return first.index < second.index

    @property
    def index_relation(self):
        '''return a list with the index of the two fields involved
        (empty list if relation is empty)'''
        return [fld.index for fld in self.relation] if self.relation else []

    @property
    def dmax(self):
        '''return integer dmax indicator (product of the two lencodec)'''
        first, second = self.relation[0], self.relation[1]
        return first.lencodec * second.lencodec

    @property
    def dmin(self):
        '''return integer dmin indicator (largest of the two lencodec)'''
        first, second = self.relation[0], self.relation[1]
        return max(first.lencodec, second.lencodec)

    @property
    def diff(self):
        '''return integer diff indicator (absolute difference of the two
        lencodec)'''
        first, second = self.relation[0], self.relation[1]
        return abs(first.lencodec - second.lencodec)

    @property
    def dran(self):
        '''return integer dran indicator (dmax - dmin)'''
        upper, lower = self.dmax, self.dmin
        return upper - lower

    @property
    def distomin(self):
        '''return integer distomin indicator (dist - dmin)'''
        lower = self.dmin
        return self.dist - lower

    @property
    def distomax(self):
        '''return integer distomax indicator (dmax - dist)'''
        upper = self.dmax
        return upper - self.dist

    @property
    def distance(self):
        '''return integer distance indicator (distomin + diff)'''
        return sum((self.distomin, self.diff))

    @property
    def ratecpl(self):
        '''return float ratecpl indicator : distance / (distance + distomax),
        0 if the denominator is null'''
        total = self.distance + self.distomax
        if total == 0:
            return 0
        return self.distance / total

    @property
    def rateder(self):
        '''return float rateder indicator : distomin / dran,
        0 if dran is null'''
        span = self.dran
        if span == 0:
            return 0
        return self.distomin / span

    @property
    def typecoupl(self):
        '''return relationship type (coupled, derived, crossed, linked).

        The first null indicator among distance, distomin and distomax gives
        the type; the relationship is linked if none is null.'''
        ordered_gaps = ((self.distance, COUPLED),
                        (self.distomin, DERIVED),
                        (self.distomax, CROSSED))
        for gap, label in ordered_gaps:
            if gap == 0:
                return label
        return LINKED

This class analyses relationship between two fields

Attributes :

  • relation : List of the two fields involved in the relationship
  • dist : value of the relationship
  • distrib : boolean True if values are distributed
  • hashr: integer - hash value to identify update

global (@property)

characteristic (@property)

instance methods

AnaRelation(relation, dists, hashr=None)
289    def __init__(self, relation, dists, hashr=None):
290        '''Constructor of the relationship :
291
292         *Parameters*
293
294        - **relation** : List of the two fields involved in the relationship
295        - **dists** : dist value or list of dist value and distrib boolean
296        - **distrib** : boolean True if values are distributed
297        - **hashr**: integer - hash value to identify update
298        '''
299        self.relation = relation
300        if isinstance(dists, list):
301            self.dist = dists[0]
302            self.distrib = dists[1]
303        else:
304            self.dist = dists
305            self.distrib = None
306        self.hashr = hashr

Constructor of the relationship :

Parameters

  • relation : List of the two fields involved in the relationship
  • dists : dist value or list of dist value and distrib boolean
  • distrib : boolean True if values are distributed (not a separate argument: it is given as the second item of dists when dists is a list)
  • hashr: integer - hash value to identify update
relation
hashr
def to_dict( self, distances=False, full=False, mode='field', relation=False, notnone=True, misc=False):
327    def to_dict(self, distances=False, full=False, mode='field', relation=False,
328                notnone=True, misc=False):
329        '''return a dict with AnaRelation attributes.
330
331         *Parameters*
332
333        - **distances** : boolean (default False) - if True, distances indicators are included
334        - **full** : boolean (default False) - if True, all the attributes are included
335        - **relation** : boolean (default False) - if True, idfield are included
336        - **notnone** : boolean (default True) - if True, None values are not included
337        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
338        '''
339        dic = {DIST: self.dist, TYPECOUPL: self.typecoupl, HASHR: self.hashr}
340        if relation or full:
341            dic[RELATION] = Util.view(self.relation, mode)
342            dic[PARENTCHILD] = self.parent_child
343        if distances or full:
344            dic |= {DISTANCE: self.distance, DISTOMIN: self.distomin,
345                    DISTOMAX: self.distomax, DISTRIBUTED: self.distrib,
346                    RATECPL: self.ratecpl, RATEDER: self.rateder}
347        if misc or full:
348            dic |= {DMAX: self.dmax, DMIN: self.dmin,
349                    DIFF: self.diff, DRAN: self.dran}
350        if notnone:
351            return Util.reduce_dic(dic)
352        return dic

return a dict with AnaRelation attributes.

Parameters

  • distances : boolean (default False) - if True, distances indicators are included
  • full : boolean (default False) - if True, all the attributes are included
  • relation : boolean (default False) - if True, idfield are included
  • notnone : boolean (default True) - if True, None values are not included
  • mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
id_relation
354    @property
355    def id_relation(self):
356        '''return a list with the id of the two fields involved'''
357        if self.relation:
358            return [fld.idfield for fld in self.relation]
359        return []

return a list with the id of the two fields involved

parent_child
361    @property
362    def parent_child(self):
363        '''returns the direction of the relationship (True if parent is first)'''
364        rel0 = self.relation[0]
365        rel1 = self.relation[1]
366        return (rel0.lencodec > rel1.lencodec or
367                (rel0.lencodec == rel1.lencodec and rel0.index < rel1.index))

returns the direction of the relationship (True if parent is first)

index_relation
369    @property
370    def index_relation(self):
371        '''return a list with the index of the two fields involved'''
372        if self.relation:
373            return [fld.index for fld in self.relation]
374        return []

return a list with the index of the two fields involved

dmax
376    @property
377    def dmax(self):
378        '''return integer dmax indicator'''
379        return self.relation[0].lencodec * self.relation[1].lencodec

return integer dmax indicator

dmin
381    @property
382    def dmin(self):
383        '''return integer dmin indicator'''
384        return max(self.relation[0].lencodec, self.relation[1].lencodec)

return integer dmin indicator

diff
386    @property
387    def diff(self):
388        '''return integer diff indicator'''
389        return abs(self.relation[0].lencodec - self.relation[1].lencodec)

return integer diff indicator

dran
391    @property
392    def dran(self):
393        '''return integer dran indicator'''
394        return self.dmax - self.dmin

return integer dran indicator

distomin
396    @property
397    def distomin(self):
398        '''return integer distomin indicator'''
399        return self.dist - self.dmin

return integer distomin indicator

distomax
401    @property
402    def distomax(self):
403        '''return integer distomax indicator'''
404        return self.dmax - self.dist

return integer distomax indicator

distance
406    @property
407    def distance(self):
408        '''return integer distance indicator'''
409        return self.distomin + self.diff

return integer distance indicator

ratecpl
411    @property
412    def ratecpl(self):
413        '''return float ratecpl indicator'''
414        disdis = self.distance + self.distomax
415        return 0 if disdis == 0 else self.distance / disdis

return float ratecpl indicator

rateder
417    @property
418    def rateder(self):
419        '''return float rateder indicator'''
420        return 0 if self.dran == 0 else self.distomin / self.dran

return float rateder indicator

typecoupl
422    @property
423    def typecoupl(self):
424        '''return relationship type (coupled, derived, crossed, linked)'''
425        if self.distance == 0:
426            return COUPLED
427        if self.distomin == 0:
428            return DERIVED
429        if self.distomax == 0:
430            return CROSSED
431        return LINKED

return relationship type (coupled, derived, crossed, linked)

class AnaDfield(AnaField):
class AnaDfield(AnaField):
    '''This class analyses structure and relationships of fields inside a dataset

    *Attributes* :

    - **dataset** : AnaDataset object where AnaDfield is included
    - **AnaField attributes** : inheritance of AnaField object

    *relationship (@property)*

    - `list_relations`
    - `list_p_derived`
    - `list_c_derived`
    - `list_coupled`

    *field (@property)*

    - `fields`
    - `p_derived`
    - `p_distance`
    - `p_distomin`

    *global (@property)*

    - `index`
    - `dist_root`
    - `category`

    *global (instance methods)*

    - `ascendants`
    - `to_dict`
    - `view`

    *other instance methods*

    - `dic_inner_node`
    '''
    def __new__(cls, other, dataset=None):
        '''initialization of attributes from "other" :

        - AnaDfield : a copy of "other" is returned,
        - AnaField : a copy of "other" whose class is switched to AnaDfield
        is returned,
        - other value : a bare instance is returned.
        '''
        if isinstance(other, AnaDfield):
            return AnaDfield.__copy__(other)
        if isinstance(other, AnaField):
            new = AnaField.__copy__(other)
            new.__class__ = AnaDfield
            return new
        return object.__new__(cls)

    def __init__(self, other, dataset):
        '''AnaDfield is created by adding a AnaDataset link to an AnaField object.

         *Parameters*

        - **other** : AnaField or AnaDfield to initialize attributes (already
        used by `__new__`, not read here)
        - **dataset** : AnaDataset which includes the AnaDfield
        '''
        self.dataset = dataset

    def __copy__(self):
        ''' Copy all the data '''
        return self.__class__(AnaField(self), self.dataset)

    def __lt__(self, other):
        ''' return a comparison between field index'''
        return self.index < other.index

    @property
    def index(self):
        '''return the row of the field in the AnaDataset (-1 for the root)'''
        if self == self.dataset.root:
            return -1
        return self.dataset.fields.index(self)

    @property
    def fields(self):
        '''return the list of the fields included in the AnaDataset'''
        return self.dataset.fields

    @property
    def list_relations(self):
        '''return the list of the relations with the AnaDfield'''
        return list(self.dataset.relations[self].values())

    @property
    def list_p_derived(self):
        '''return the list of the derived relations with the parents of AnaDfield'''
        return [rel for rel in self.list_relations if rel.typecoupl == DERIVED
                and not rel.parent_child]

    @property
    def list_c_derived(self):
        '''return the list of the derived relations with the children of
        AnaDfield (children whose category is unique are excluded)'''
        return [rel for rel in self.list_relations if rel.typecoupl == DERIVED
                and rel.parent_child
                and rel.relation[1].category != UNIQUE]

    @property
    def list_coupled(self):
        '''return the list of the coupled relations with the AnaDfield'''
        return [rel for rel in self.list_relations if rel.typecoupl == COUPLED]

    @property
    def dist_root(self):
        '''return the distance to the root field (len(dataset) - lencodec)'''
        return len(self.dataset) - self.lencodec

    @property
    def category(self):
        '''return AnaDfield category (unique, rooted, coupled, derived, mixed)'''
        typecodec = self.typecodec
        if typecodec == UNIQUE:
            return UNIQUE
        if typecodec in (COMPLETE, FULL):
            return ROOTED
        # coupled only if the AnaDfield is the child side of a coupled relation
        if any(rel.typecoupl == COUPLED for rel in self.list_relations
               if not rel.parent_child):
            return COUPLED
        if not self.list_c_derived:
            return DERIVED
        return MIXED

    @property
    def p_derived(self):
        '''return the first derived or coupled parent of the AnaDfield
        (the root of the dataset if no such parent is found)'''
        category = self.category
        if category in (UNIQUE, ROOTED):
            return self.dataset.root
        if category == COUPLED:
            return [rel.relation[1] for rel in self.list_coupled
                    if rel.relation[1].category != COUPLED][0]
        list_p_derived = self.list_p_derived
        if not list_p_derived:
            return self.dataset.root
        distance_min = min(rel.distance for rel in list_p_derived)
        for rel in list_p_derived:
            if rel.distance == distance_min:
                if rel.relation[1].category == ROOTED:
                    return self.dataset.root
                if rel.relation[1].category == MIXED:
                    return rel.relation[1]
        return self.dataset.root

    @property
    def p_distance(self):
        '''return the first parent with minimal distance of the AnaDfield'''
        return self._p_min_dist()

    @property
    def p_distomin(self):
        '''return the first parent with minimal distomin of the AnaDfield'''
        return self._p_min_dist(False)

    def _p_min_dist(self, distance=True):
        '''return the parent with minimal distance of the AnaDfield.

         *Parameters*

        - **distance** : boolean (default True) - if True the `distance`
        indicator is used, else the `distomin` indicator

        *Returns* : the root if the AnaDfield is unique, has no parent
        relation or if the minimal value equals `dist_root`; else the first
        parent with the minimal value and the largest lencodec.
        '''
        if self.category == UNIQUE:
            return self.dataset.root
        indicator = 'distance' if distance else 'distomin'
        # only the relations where the other field is the parent are eligible:
        # the candidates are taken from the same set as the minimum, so a
        # child with the same value can never be returned as a parent
        parent_rels = [rel for rel in self.list_relations
                       if not rel.parent_child]
        if not parent_rels:
            return self.dataset.root
        dist_min = min(getattr(rel, indicator) for rel in parent_rels)
        if dist_min == self.dist_root:
            return self.dataset.root
        list_dmin = [rel.relation[1] for rel in parent_rels
                     if getattr(rel, indicator) == dist_min]
        # max returns the first item when several share the largest lencodec
        return max(list_dmin, key=lambda fld: fld.lencodec)

    def to_dict(self, mode='id'):
        '''return a dict with field attributes.

         *Parameters*

        - **mode** : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
        '''
        dic = super().to_dict(full=True, idfield=False, notnone=False)
        dic[DISTROOT] = self.dist_root
        dic[NUM] = self.index
        dic[CATEGORY] = self.category
        dic[PDISTANCE] = self.p_distance.view(mode)
        dic[PDISTOMIN] = self.p_distomin.view(mode)
        dic[PDERIVED] = self.p_derived.view(mode)
        return dic

    def view(self, mode='field'):
        ''' return a representation of the AnaDfield

         *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
        '''
        return Util.view(self, mode)

    def ascendants(self, typeparent='derived', mode='field'):
        ''' return the list of the AnaDfield's ascendants in the family tree up to
        the root AnaDfield.

         *Parameters*

        - **typeparent** : str (default 'derived') - 'derived', 'distance' or
        'distomin' (any other value is handled as 'distomin')
        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')

        *Returns* : list of parents from the closest to the most distant
        (root excluded). Parents are represented with index, idfield, or object
        '''
        parent = self
        listparent = []
        while parent != self.dataset.root:
            if typeparent == 'derived':
                parent = parent.p_derived
            elif typeparent == 'distance':
                parent = parent.p_distance
            else:
                parent = parent.p_distomin
            if parent != self.dataset.root:
                listparent.append(parent)
        return Util.view(listparent, mode)

    def dic_inner_node(self, mode, lname):
        '''return a child AnaDfield tree.

         *Parameters*

        - **lname** : integer - maximal length of the names
        - **mode** : string - kind of tree :
            'derived' : derived tree
            'distance': min distance tree
            'distomin': min distomin tree
        Any other value gives a node without child, labelled with lencodec only.

        *Returns* : dict where key is the index of the AnaDfield (string,
        padded with '*') and value is a list with the label
        "name ( dist - lencodec)" followed by the trees of the children.
        '''
        adding = ''
        if mode == 'distance':
            rel_parent = self.dataset.get_relation(self, self.p_distance)
            adding = str(rel_parent.distance) + ' - '
        elif mode == 'distomin':
            rel_parent = self.dataset.get_relation(self, self.p_distomin)
            adding = str(rel_parent.distomin) + ' - '
        elif mode == 'derived':
            rel_parent = self.dataset.get_relation(self, self.p_derived)
            adding = str(rel_parent.distance) + ' - '
        adding += str(self.lencodec)
        name = str(self.idfield)[:lname] + ' (' + adding + ')'
        lis = [name.replace(' ', '*').replace("'", '*')]
        # bound for every mode: an unsupported mode gives a node without child
        # instead of raising UnboundLocalError
        childs = []
        if mode == 'derived':
            category = self.category
            if category not in (ROOTED, COUPLED, UNIQUE):
                # coupled fields are attached to the node before its children
                for rel in self.list_coupled:
                    lis.append(rel.relation[1].dic_inner_node(mode, lname))
            if category not in (ROOTED, UNIQUE):
                childs = [rel.relation[1] for rel in self.list_relations
                          if rel.relation[1].p_derived == self and
                          rel.relation[1].category != COUPLED]
        elif mode == 'distomin':
            childs = [rel.relation[1] for rel in self.list_relations
                      if rel.relation[1].p_distomin == self]
        elif mode == 'distance':
            childs = [rel.relation[1] for rel in self.list_relations
                      if rel.relation[1].p_distance == self]
        for fld in childs:
            lis.append(fld.dic_inner_node(mode, lname))
        return {str(self.index).ljust(2, '*'): lis}

This class analyses structure and relationships of fields inside a dataset

Attributes :

  • dataset : AnaDataset object where AnaDfield is included
  • AnaField attributes : inheritance of AnaField object

relationship (@property)

field (@property)

global (@property)

global (instance methods)

other instance methods

AnaDfield(other, dataset)
    def __init__(self, other, dataset):
        '''AnaDfield is created by adding a AnaDataset link to an AnaField object.

         *Parameters*

        - **other** : AnaField or AnaDfield to initialize attributes (not
        read in this method)
        - **dataset** : AnaDataset which includes the AnaDfield
        '''
        # the only state set here is the link to the enclosing dataset
        self.dataset = dataset

AnaDfield is created by adding a AnaDataset link to an AnaField object.

Parameters

  • other : AnaField or AnaDfield to initialize attributes
  • dataset : AnaDataset which includes the AnaDfield
dataset
index
501    @property
502    def index(self):
503        '''return the row of the field in the AnaDataset'''
504        if self == self.dataset.root:
505            return -1
506        return self.dataset.fields.index(self)

return the row of the field in the AnaDataset

fields
508    @property
509    def fields(self):
510        '''return the list of the fields included in the AnaDataset'''
511        return self.dataset.fields

return the list of the fields included in the AnaDataset

list_relations
513    @property
514    def list_relations(self):
515        '''return the list of the relations with the AnaDfield'''
516        return list(self.dataset.relations[self].values())

return the list of the relations with the AnaDfield

list_p_derived
518    @property
519    def list_p_derived(self):
520        '''return the list of the derived relations with the parents of AnaDfield'''
521        return [rel for rel in self.list_relations if rel.typecoupl == DERIVED
522                and not rel.parent_child]

return the list of the derived relations with the parents of AnaDfield

list_c_derived
524    @property
525    def list_c_derived(self):
526        '''return the list of the derived relations with the childs of AnaDfield'''
527        return [rel for rel in self.list_relations if rel.typecoupl == DERIVED
528                and rel.parent_child
529                and rel.relation[1].category != UNIQUE]

return the list of the derived relations with the children of AnaDfield

list_coupled
531    @property
532    def list_coupled(self):
533        '''return the list of the coupled relations with the AnaDfield'''
534        return [rel for rel in self.list_relations if rel.typecoupl == COUPLED]

return the list of the coupled relations with the AnaDfield

dist_root
536    @property
537    def dist_root(self):
538        '''return the distance to the root field'''
539        return len(self.dataset) - self.lencodec

return the distance to the root field

category
541    @property
542    def category(self):
543        '''return AnaDfield category (unique, rooted, coupled, derived, mixed)'''
544        if self.typecodec == UNIQUE:
545            return UNIQUE
546        if self.typecodec in (COMPLETE, FULL):
547            return ROOTED
548        if COUPLED in [rel.typecoupl for rel in self.list_relations
549                       if not rel.parent_child]:
550            return COUPLED
551        if not self.list_c_derived:
552            return DERIVED
553        return MIXED

return AnaDfield category (unique, rooted, coupled, derived, mixed)

p_derived
555    @property
556    def p_derived(self):
557        '''return the first derived or coupled parent of the AnaDfield'''
558        if self.category in (UNIQUE, ROOTED):
559            return self.dataset.root
560        if self.category == COUPLED:
561            return [rel.relation[1] for rel in self.list_coupled
562                    if not rel.relation[1].category == COUPLED][0]
563        if not self.list_p_derived:
564            return self.dataset.root
565        distance_min = min(rel.distance for rel in self.list_p_derived)
566        for rel in self.list_p_derived:
567            if rel.distance == distance_min:
568                if rel.relation[1].category == ROOTED:
569                    return self.dataset.root
570                if rel.relation[1].category == MIXED:
571                    return rel.relation[1]
572        return self.dataset.root

return the first derived or coupled parent of the AnaDfield

p_distance
574    @property
575    def p_distance(self):
576        '''return the first parent with minimal distance of the AnaDfield'''
577        return self._p_min_dist()

return the first parent with minimal distance of the AnaDfield

p_distomin
579    @property
580    def p_distomin(self):
581        '''return the first parent with minimal distomin of the AnaDfield'''
582        return self._p_min_dist(False)

return the first parent with minimal distomin of the AnaDfield

def to_dict(self, mode='id'):
606    def to_dict(self, mode='id'):
607        '''return a dict with field attributes.
608
609         *Parameters*
610
611        - **mode** : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
612        '''
613        dic = super().to_dict(full=True, idfield=False, notnone=False)
614        dic[DISTROOT] = self.dist_root
615        dic[NUM] = self.index
616        dic[CATEGORY] = self.category
617        dic[PDISTANCE] = self.p_distance.view(mode)
618        dic[PDISTOMIN] = self.p_distomin.view(mode)
619        dic[PDERIVED] = self.p_derived.view(mode)
620        return dic

return a dict with field attributes.

Parameters

  • mode : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
def view(self, mode='field'):
622    def view(self, mode='field'):
623        ''' return a representation of the AnaDfield
624
625         *Parameters*
626
627        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
628        '''
629        return Util.view(self, mode)

return a representation of the AnaDfield

Parameters

  • mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
def ascendants(self, typeparent='derived', mode='field'):
631    def ascendants(self, typeparent='derived', mode='field'):
632        ''' return the list of the AnaDfield's ascendants in the family tree up to
633        the root AnaDfield.
634
635         *Parameters*
636
637        - **typeparent** : str (default 'derived') - 'derived', 'distance' or 'distomin'
638        - **mode** : str (default 'field') - AnaDfield representation
639        ('field', 'id', 'index')
640
641        *Returns* : list of parents from closest to the most distant. Parents
642        are represented with index, idfield, or object
643        '''
644        parent = self
645        listparent = []
646        while parent != self.dataset.root:
647            if typeparent == 'derived':
648                parent = parent.p_derived
649            elif typeparent == 'distance':
650                parent = parent.p_distance
651            else:
652                parent = parent.p_distomin
653            if parent != self.dataset.root:
654                listparent.append(parent)
655        return Util.view(listparent, mode)

return the list of the AnaDfield's ascendants in the family tree up to the root AnaDfield.

Parameters

  • typeparent : str (default 'derived') - 'derived', 'distance' or 'distomin'
  • mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')

Returns : list of parents ordered from the closest to the most distant (the root is not included). Parents are represented by their index, their idfield, or the object itself, depending on mode

def dic_inner_node(self, mode, lname):
657    def dic_inner_node(self, mode, lname):
658        '''return a child AnaDfield tree.
659
660         *Parameters*
661
662        - **lname** : integer - maximal length of the names
663        - **mode** : string (default 'derived') - kind of tree :
664            'derived' : derived tree
665            'distance': min distance tree
666            'distomin': min distomin tree
667
668        *Returns* : dict where key is a AnaDfield and value is the list of
669        the childs "name ( dist - lencodec)".
670        '''
671        adding = ''
672        if mode == 'distance':
673            rel_parent = self.dataset.get_relation(self, self.p_distance)
674            adding = str(rel_parent.distance) + ' - '
675        elif mode == 'distomin':
676            rel_parent = self.dataset.get_relation(self, self.p_distomin)
677            adding = str(rel_parent.distomin) + ' - '
678        elif mode == 'derived':
679            rel_parent = self.dataset.get_relation(self, self.p_derived)
680            adding = str(rel_parent.distance) + ' - '
681        adding += str(self.lencodec)
682        name = str(self.idfield)[:lname] + ' (' + adding + ')'
683        lis = [name.replace(' ', '*').replace("'", '*')]
684        if mode == 'derived':
685            childs = []
686            if not self.category in (ROOTED, COUPLED, UNIQUE):
687                for rel in self.list_coupled:
688                    lis.append(rel.relation[1].dic_inner_node(mode, lname))
689            if not self.category in (ROOTED, UNIQUE):
690                childs = [rel.relation[1] for rel in self.list_relations
691                          if rel.relation[1].p_derived == self and
692                          rel.relation[1].category != COUPLED]
693        if mode == 'distomin':
694            childs = [rel.relation[1] for rel in self.list_relations
695                      if rel.relation[1].p_distomin == self]
696        if mode == 'distance':
697            childs = [rel.relation[1] for rel in self.list_relations
698                      if rel.relation[1].p_distance == self]
699        for fld in childs:
700            lis.append(fld.dic_inner_node(mode, lname))
701        return {str(self.index).ljust(2, '*'): lis}

return a child AnaDfield tree.

Parameters

  • lname : integer - maximal length of the names
  • mode : string (required, no default value) - kind of tree : 'derived' (derived tree), 'distance' (min distance tree), 'distomin' (min distomin tree)

Returns : dict where key is a AnaDfield and value is the list of the childs "name ( dist - lencodec)".

class AnaDataset:
class AnaDataset:
    '''This class analyses the structure of a dataset.

    *Attributes* :

    - **iddataset** : string or integer - Id of the Dataset
    - **fields** : list of the AnaDfields included
    - **relations** : dict of the AnaRelations between two AnaDfields
    (two levels: `relations[fld1][fld2]` is the AnaRelation from fld1 to fld2)
    - **hashd** : string - update identifier

    *relationship (@property)*

    - `ana_relations`
    - `p_relations`

    *field (@property)*

    - `root`
    - `primary`
    - `secondary`
    - `unique`
    - `mixte`
    - `variable`

    *global (@property)*

    - `category`
    - `complete`
    - `dimension`

    *update (instance methods)*

    - `set_relations`

    *access (instance methods)*

    - `get_relation`
    - `dfield`

    *synthesis (instance methods)*

    - `tree`
    - `to_dict`
    - `indicator`
    - `partitions`
    - `field_partition`
    - `relation_partition`
    '''
 752
 753    def __init__(self, fields=None, relations=None, iddataset=None,
 754                 leng=None, hashd=None):
 755        '''Creation mode :
 756        - single dict attribute where keys are attributes name,
 757        - single AnaDataset attribute to make a copy
 758        - multiple attributes
 759
 760        *Parameters (single dict)*
 761
 762        - **fields**: {'fields': list_of_dict, 'name': id_dataset,
 763                       'length': length, 'relations': dict_of_relations
 764            where:
 765                list_of_dict : {'id': id_field, 'lencodec': len_codec, 'mincodec': min_codec}
 766                id_field: string - name of field
 767                other_field: string - name of field
 768                len_codec: int - length of the codec
 769                min_codec: int - number of different codec values
 770                id_dataset : name of the dataset
 771                length: int - length of the dataset
 772                dict_of_relations: {id_field : {other_field: dist} for all fields}
 773                field: name of a field
 774                field_other: name of another field
 775                dist: integer (distance between the two fields) or
 776                array (distance and boolean distributed)
 777
 778        *Parameters (multiple attributes)*
 779
 780        - **fields**: list_of_dict
 781        - **iddataset** : string (default None) - id_dataset
 782        - **relations** : dict (default None) - dict_of_relations
 783        - **leng** : int (default None) - length
 784        - **hashd** : string (default None) - update identifier
 785        '''
 786        if isinstance(fields, AnaDataset):
 787            self.iddataset = fields.iddataset
 788            self.fields = fields.fields
 789            self.relations = fields.relations
 790            self.hashd = fields.hashd
 791            return
 792        if isinstance(fields, dict):
 793            iddataset = fields.get(IDDATASET, None)
 794            leng = fields.get(LENGTH, None)
 795            relations = fields.get(RELATIONS, None)
 796            hashd = fields.get(HASHD)
 797            fields = fields.get(FIELDS, None)
 798        self.iddataset = iddataset
 799        self.fields = [AnaDfield(AnaField(field), self)
 800                       for field in fields] if fields else []
 801        if leng:
 802            for fld in self.fields:
 803                fld.maxcodec = leng
 804        self.relations = {field: {} for field in self.fields}
 805        if relations:
 806            for fld, dic_relation in relations.items():
 807                self.set_relations(fld, dic_relation)
 808        self.hashd = hashd
 809
 810    def __len__(self):
 811        '''length of the AnaDataset (len of the AnaDfields included)'''
 812        return max(len(fld) for fld in self.fields)
 813
 814    def __eq__(self, other):
 815        ''' equal if class and values are equal'''
 816        return self.__class__ .__name__ == other.__class__.__name__ and \
 817            self.fields == other.fields and self.relations == other.relations and \
 818            self.iddataset == other.iddataset and self.hashd == other.hashd
 819
 820    def __hash__(self):
 821        '''return hash value (sum of attributes hash)'''
 822        return hash(self.iddataset) + sum(hash(fld) for fld in self.fields) + \
 823            sum(hash(rel) for rel in self.relations) + hash(self.hashd)
 824
 825    @property
 826    def category(self):
 827        '''return a list of AnaDfield category (unique, rooted, coupled, derived, mixed)'''
 828        return [fld.category for fld in self.fields]
 829
 830    @property
 831    def ana_relations(self):
 832        '''return the list of AnaRelation included'''
 833        return [rel for fldrel in self.relations.values() for rel in fldrel.values()]
 834
 835    @property
 836    def p_relations(self):
 837        '''return the list of oriented AnaRelation (parent first, child second)'''
 838        return [rel for rel in self.ana_relations if rel.parent_child]
 839
 840    @property
 841    def root(self):
 842        '''return the root AnaDfield'''
 843        len_self = len(self)
 844        return AnaDfield(AnaField(ROOT, len_self, len_self, len_self), self)
 845
 846    @property
 847    def primary(self):
 848        '''return the first partition of the partitions'''
 849        return self.field_partition(mode='field')['primary']
 850        # part = self.partitions(mode='field', distributed=True)
 851        # return part[0] if part else []
 852
 853    @property
 854    def complete(self):
 855        '''return True if the dimension is not 0'''
 856        return self.dimension > 0
 857
 858    @property
 859    def dimension(self):
 860        '''return the highest partition lenght'''
 861        return len(self.primary)
 862
 863    @property
 864    def secondary(self):
 865        '''return the derived ou coupled fields from primary'''
 866        return self.field_partition(mode='field')['secondary']
 867
 868    @property
 869    def unique(self):
 870        '''return the unique fields'''
 871        return [fld for fld in self.fields if fld.category == UNIQUE]
 872
 873    @property
 874    def variable(self):
 875        '''return the variable fields'''
 876        return self.field_partition(mode='field')['variable']
 877
 878    @property
 879    def mixte(self):
 880        '''return the variable fields'''
 881        return self.field_partition(mode='field')['mixte']
 882
 883    def set_relations(self, field, dic_relations):
 884        '''Add relations in the AnaDataset from a dict.
 885
 886         *Parameters*
 887
 888        - **field** : AnaDfield, AnaField or str (idfield) - first relation AnaDfield
 889        - **dic_relations** : dict - key is the second relation AnaDfield and
 890        value is the dist value or teh list [dist, distrib]
 891        '''
 892        fld = self.dfield(field)
 893        for other, dist in dic_relations.items():
 894            oth = self.dfield(other)
 895            self.relations[fld][oth] = AnaRelation([fld, oth], dist)
 896            self.relations[oth][fld] = AnaRelation([oth, fld], dist)
 897
 898    def get_relation(self, fld1, fld2):
 899        '''Return AnaRelation between fld1 and fld2.
 900
 901         *Parameters*
 902
 903        - **fld1** : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield
 904        - **fld2** : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield
 905        '''
 906        fl1 = self.dfield(fld1)
 907        fl2 = self.dfield(fld2)
 908        if self.root in [fl1, fl2]:
 909            return AnaRelation([fl1, fl2], len(self))
 910        return self.relations[self.dfield(fld1)][self.dfield(fld2)]
 911
 912    def dfield(self, fld):
 913        '''return the AnaDfield matching with fld. Fld is str, int, AnaDfield or AnaField'''
 914        if fld in (-1, ROOT):
 915            return self.root
 916        if isinstance(fld, AnaDfield):
 917            return fld
 918        if isinstance(fld, int):
 919            return self.fields[fld]
 920        if isinstance(fld, str):
 921            if fld in [dfld.idfield for dfld in self.fields]:
 922                return [dfld for dfld in self.fields if dfld.idfield == fld][0]
 923            return None
 924        return AnaDfield(fld, self)
 925
 926    def tree(self, mode='derived', width=5, lname=20, string=True):
 927        '''return a string with a tree of derived Field.
 928
 929         *Parameters*
 930
 931        - **lname** : integer (default 20) - length of the names
 932        - **width** : integer (default 5) - length of the lines
 933        - **string** : boolean (default True) - if True return str else return dict
 934        - **mode** : string (default 'derived') - kind of tree :
 935            'derived' : derived tree
 936            'distance': min distance tree
 937            'distomin': min distomin tree
 938        '''
 939        lis = ['root-' + mode + '*(' + str(len(self)) + ')']
 940        if mode == 'distance':
 941            childs = [fld for fld in self.fields if fld.p_distance == self.root]
 942        elif mode == 'distomin':
 943            childs = [fld for fld in self.fields if fld.p_distomin == self.root]
 944        elif mode == 'derived':
 945            childs = [fld for fld in self.fields if fld.p_derived == self.root]
 946        for fld in childs:
 947            lis.append(fld.dic_inner_node(mode, lname))
 948        tree = {str(-1).ljust(2, '*'): lis}
 949        if string:
 950            tre = pprint.pformat(tree, indent=0, width=width)
 951            tre = tre.replace('---', ' - ')
 952            tre = tre.replace('  ', ' ')
 953            tre = tre.replace('*', ' ')
 954            for car in ["'", "\"", "{", "[", "]", "}", ","]:
 955                tre = tre.replace(car, "")
 956            return tre
 957        return Util.clean_dic(tree, '*', ' ')
 958
 959    def to_dict(self, mode='field', keys=None, relations=False):
 960        '''return a dict with fields attributes and optionaly relations attributes.
 961
 962         *Parameters*
 963
 964        - **mode** : str (default 'field') - AnaDfield representation
 965        ('field', 'id', 'index')
 966        - **relations** : boolean (default: False) - if False return a list of fields,
 967        if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}'
 968        - **keys** : string, list or tuple - list of keys or single key to return
 969        if 'all' or None, all keys are returned
 970        if list, only keys in list are returned
 971        if string, only values associated to the string(key) are returned'''
 972        fields = Util.filter_dic([fld.to_dict(mode=mode)
 973                                 for fld in self.fields], keys)
 974        leng = len(self.fields)
 975        if not relations:
 976            return fields
 977        return {'fields': fields, 'relations':
 978                [self.get_relation(i, j).to_dict(full=True, mode=mode)
 979                 for i in range(-1, leng) for j in range(i + 1, leng)]}
 980
    def partitions(self, mode='id', distributed=True):
        '''return a list of available partitions (the first is highest).

        A partition is a list of fields. The result contains each ROOTED field
        (as a single-field partition) and the groups of crossed fields found
        by the search below, without duplicates, the longest partitions first.

         *Parameters*

        - **mode** : str (default 'id') - AnaDfield representation
        ('field', 'id', 'index')
        - **distributed** : boolean (default True) - Include only distributed fields
        '''
        # each rooted field is a partition on its own
        partit = [[fld] for fld in self.fields if fld.category == ROOTED]
        # candidate relations: crossed, oriented and without coupled field
        crossed = [rel for rel in self.ana_relations if rel.typecoupl == CROSSED
                   and rel.parent_child
                   and rel.relation[0].category != COUPLED
                   and rel.relation[1].category != COUPLED]
        if distributed:
            crossed = [rel for rel in crossed if rel.distrib]
        if crossed and len(crossed) == 1 and crossed[0].dist == len(self):
            # a single crossed relation covering the whole dataset
            partit.insert(0, crossed[0].relation)
        elif crossed:
            # every combination of crossed relations is tested (1 relation,
            # then 2 relations, ... up to all the relations)
            for repeat in list(range(len(crossed))):
                candidates = combinations(crossed, repeat + 1)
                for candidat in candidates:
                    # distinct fields used by the relations of the combination
                    flds = list(set(rel.relation[i]
                                for rel in candidat for i in [0, 1]))
                    # kept if the product of the lencodec is the length of
                    # the dataset and if the combination holds one relation
                    # per pair of fields (sum(range(n)) == n * (n - 1) / 2)
                    if (reduce(mul, [fld.lencodec for fld in flds]) == len(self) and
                        len(candidat) == sum(range(len(flds))) and
                            (not distributed or min(rel.distrib for rel in candidat))):
                        partit.insert(0, flds)
        # duplicates are removed (fields sorted inside each partition), then
        # the partitions are sorted and ordered by decreasing length
        # NOTE(review): sorting relies on AnaDfield ordering, defined outside
        # this excerpt - confirm
        partit = [list(tup) for tup in
                  sorted(sorted(list({tuple(sorted(prt)) for prt in partit})),
                         key=len, reverse=True)]
        return Util.view(partit, mode)
1013
1014    def field_partition(self, mode='id', partition=None, distributed=True):
1015        '''return a partition dict with the list of primary, secondary, unique
1016        and variable fields.
1017
1018        *Parameters*
1019
1020        - **mode** : str (default 'id') - AnaDfield representation
1021        ('field', 'id', 'index')
1022        - **partition** : list of str, int, AnaDfield or AnaField(default None) -
1023        if None, partition is the first
1024        - **distributed** : boolean (default True) - Include only distributed fields
1025        '''
1026        partitions = self.partitions(mode='field', distributed=distributed)
1027        if not partitions:
1028            return Util.view(
1029                {'primary': [], 'secondary': [
1030                    fld for fld in self.fields if fld.category != UNIQUE],
1031                 'mixte': [], 'unique': [
1032                    fld for fld in self.fields if fld.category == UNIQUE],
1033                 'variable': []}, mode)
1034        if not partition:
1035            partition = partitions[0]
1036        else:
1037            # partition = [self.dfield(fld) for fld in tuple(sorted(partition))]
1038            partition = [self.dfield(fld) for fld in tuple(partition)]
1039        secondary = []
1040        for field in partition:
1041            self._add_child(field, secondary)
1042        secondary = [fld for fld in secondary if not fld in partition]
1043        unique = [fld for fld in self.fields if fld.category == UNIQUE]
1044        mixte = list(self._mixte_dims(partition, partitions))
1045        variable = [fld for fld in self.fields
1046                    if not fld in partition + secondary + unique + mixte]
1047        return Util.view({'primary': partition, 'secondary': secondary,
1048                          'mixte': mixte, 'unique': unique,
1049                          'variable': variable}, mode)
1050
1051    def relation_partition(self, partition=None, primary=False, noroot=False):
1052        '''return a dict with the list of relationships for fields in a partition.
1053
1054        *Parameters*
1055
1056        - **partition** : list (default None) - if None, partition is the first
1057        - **primary** : boolean (default False) - if True, relations are primary fields
1058        - **noroot** : boolean (default False) - if True and single primary,
1059        'root' field is replaced by the primary field'''
1060        partitions = self.partitions(mode='field')
1061        if not partitions:
1062            partition = None
1063        else:
1064            partition = Util.view(partition, mode='field',
1065                                  ana=self) if partition else partitions[0]
1066        part = self.field_partition(
1067            mode='field', partition=partition, distributed=True)
1068        fields_cat = {fld: cat for cat, l_fld in part.items() for fld in l_fld}
1069        relations = {}
1070        for field in fields_cat:
1071            rel = []
1072            match fields_cat[field]:
1073                case 'primary':
1074                    rel = [field.idfield]
1075                case 'unique': ...
1076                case 'variable':
1077                    rel = [fld.idfield for fld in part['primary']]
1078                case 'secondary' if not primary:
1079                    rel = [field.p_derived.idfield]
1080                case 'secondary' if primary:
1081                    rel = [fld.idfield for fld in field.ascendants()
1082                           if fld in part['primary']]
1083                case 'mixte':
1084                    rel = [fld.idfield for fld in self._mixte_dims(
1085                        partition, partitions)[field]]
1086                case _: ...
1087            if rel == ['root'] and len(part['primary']) == 1 and noroot:
1088                rel = [part['primary'][0].idfield]
1089            if rel == ['root'] and len(part['primary']) == 0 and noroot:
1090                rel = [part['secondary'][0].idfield]
1091            relations[field.idfield] = rel
1092        return relations
1093
1094    def indicator(self, fullsize, size):
1095        '''generate size indicators: ol (object lightness), ul (unicity level),
1096        gain (sizegain)
1097
1098        *Parameters*
1099
1100        - **fullsize** : int - size with full codec
1101        - **size** : int - size with existing codec
1102
1103        *Returns* : dict'''
1104        lenindex = len(self.fields)
1105        indexlen = sum(fld.lencodec for fld in self.fields)
1106        nval = len(self) * (lenindex + 1)
1107        sval = fullsize / nval
1108        ncod = indexlen + lenindex
1109
1110        if nval != ncod:
1111            scod = (size - ncod * sval) / (nval - ncod)
1112            olight = scod / sval
1113        else:
1114            olight = None
1115        return {'total values': nval, 'mean size': round(sval, 3),
1116                'unique values': ncod, 'mean coding size': round(scod, 3),
1117                'unicity level': round(ncod / nval, 3),
1118                'optimize level': round(size / fullsize, 3),
1119                'object lightness': round(olight, 3),
1120                'maxgain': round((nval - ncod) / nval, 3),
1121                'gain': round((fullsize - size) / fullsize, 3)}
1122
1123    def _add_child(self, field, childs):
1124        ''' add derived or coupled fields in the childs list'''
1125        for rel in field.list_c_derived + field.list_coupled:
1126            child = rel.relation[1]
1127            if not child in childs and not child.category == UNIQUE:
1128                childs.append(child)
1129                if not child.category in (COUPLED, UNIQUE):
1130                    self._add_child(child, childs)
1131
1132    def _mixte_dims(self, partition, partitions):
1133        '''return dict with dimensions associated to each mixte field'''
1134        dic_mixte = {}
1135        for part in partitions:
1136            not_part = [fld for fld in part if not fld in partition]
1137            if len(not_part) == 1 and len(partition) > len(part) > 1:
1138                sub_part = [fld for fld in partition if not fld in part]
1139                if min(self.get_relation(not_part[0], fld).typecoupl == 'derived'
1140                       for fld in sub_part) is True:
1141                    dic_mixte[not_part[0]] = sub_part
1142        return dic_mixte

This class analyses the structure of a dataset.

Attributes :

  • iddataset : string or integer - Id of the Dataset
  • fields : list of the AnaDfields included
  • relations : dict of the AnaRelations between two AnaDfields
  • hashd : string - update identifier

relationship (@property)

field (@property)

global (@property)

update (instance methods)

access (instance methods)

synthesis (instance methods)

AnaDataset(fields=None, relations=None, iddataset=None, leng=None, hashd=None)
753    def __init__(self, fields=None, relations=None, iddataset=None,
754                 leng=None, hashd=None):
755        '''Creation mode :
756        - single dict attribute where keys are attributes name,
757        - single AnaDataset attribute to make a copy
758        - multiple attributes
759
760        *Parameters (single dict)*
761
762        - **fields**: {'fields': list_of_dict, 'name': id_dataset,
763                       'length': length, 'relations': dict_of_relations
764            where:
765                list_of_dict : {'id': id_field, 'lencodec': len_codec, 'mincodec': min_codec}
766                id_field: string - name of field
767                other_field: string - name of field
768                len_codec: int - length of the codec
769                min_codec: int - number of different codec values
770                id_dataset : name of the dataset
771                length: int - length of the dataset
772                dict_of_relations: {id_field : {other_field: dist} for all fields}
773                field: name of a field
774                field_other: name of another field
775                dist: integer (distance between the two fields) or
776                array (distance and boolean distributed)
777
778        *Parameters (multiple attributes)*
779
780        - **fields**: list_of_dict
781        - **iddataset** : string (default None) - id_dataset
782        - **relations** : dict (default None) - dict_of_relations
783        - **leng** : int (default None) - length
784        - **hashd** : string (default None) - update identifier
785        '''
786        if isinstance(fields, AnaDataset):
787            self.iddataset = fields.iddataset
788            self.fields = fields.fields
789            self.relations = fields.relations
790            self.hashd = fields.hashd
791            return
792        if isinstance(fields, dict):
793            iddataset = fields.get(IDDATASET, None)
794            leng = fields.get(LENGTH, None)
795            relations = fields.get(RELATIONS, None)
796            hashd = fields.get(HASHD)
797            fields = fields.get(FIELDS, None)
798        self.iddataset = iddataset
799        self.fields = [AnaDfield(AnaField(field), self)
800                       for field in fields] if fields else []
801        if leng:
802            for fld in self.fields:
803                fld.maxcodec = leng
804        self.relations = {field: {} for field in self.fields}
805        if relations:
806            for fld, dic_relation in relations.items():
807                self.set_relations(fld, dic_relation)
808        self.hashd = hashd

Creation mode :

  • single dict attribute where keys are attributes name,
  • single AnaDataset attribute to make a copy
  • multiple attributes

Parameters (single dict)

  • fields: {'fields': list_of_dict, 'name': id_dataset, 'length': length, 'relations': dict_of_relations where: list_of_dict : {'id': id_field, 'lencodec': len_codec, 'mincodec': min_codec} id_field: string - name of field other_field: string - name of field len_codec: int - length of the codec min_codec: int - number of different codec values id_dataset : name of the dataset length: int - length of the dataset dict_of_relations: {id_field : {other_field: dist} for all fields} field: name of a field field_other: name of another field dist: integer (distance between the two fields) or array (distance and boolean distributed)

Parameters (multiple attributes)

  • fields: list_of_dict
  • iddataset : string (default None) - id_dataset
  • relations : dict (default None) - dict_of_relations
  • leng : int (default None) - length
  • hashd : string (default None) - update identifier
iddataset
fields
relations
hashd
category
825    @property
826    def category(self):
827        '''return a list of AnaDfield category (unique, rooted, coupled, derived, mixed)'''
828        return [fld.category for fld in self.fields]

return a list of AnaDfield category (unique, rooted, coupled, derived, mixed)

ana_relations
830    @property
831    def ana_relations(self):
832        '''return the list of AnaRelation included'''
833        return [rel for fldrel in self.relations.values() for rel in fldrel.values()]

return the list of AnaRelation included

p_relations
835    @property
836    def p_relations(self):
837        '''return the list of oriented AnaRelation (parent first, child second)'''
838        return [rel for rel in self.ana_relations if rel.parent_child]

return the list of oriented AnaRelation (parent first, child second)

root
840    @property
841    def root(self):
842        '''return the root AnaDfield'''
843        len_self = len(self)
844        return AnaDfield(AnaField(ROOT, len_self, len_self, len_self), self)

return the root AnaDfield

primary
846    @property
847    def primary(self):
848        '''return the first partition of the partitions'''
849        return self.field_partition(mode='field')['primary']
850        # part = self.partitions(mode='field', distributed=True)
851        # return part[0] if part else []

return the first partition of the partitions

complete
853    @property
854    def complete(self):
855        '''return True if the dimension is not 0'''
856        return self.dimension > 0

return True if the dimension is not 0

dimension
858    @property
859    def dimension(self):
860        '''return the highest partition lenght'''
861        return len(self.primary)

return the highest partition length

secondary
863    @property
864    def secondary(self):
865        '''return the derived ou coupled fields from primary'''
866        return self.field_partition(mode='field')['secondary']

return the derived or coupled fields from primary

unique
868    @property
869    def unique(self):
870        '''return the unique fields'''
871        return [fld for fld in self.fields if fld.category == UNIQUE]

return the unique fields

variable
873    @property
874    def variable(self):
875        '''return the variable fields'''
876        return self.field_partition(mode='field')['variable']

return the variable fields

mixte
878    @property
879    def mixte(self):
880        '''return the variable fields'''
881        return self.field_partition(mode='field')['mixte']

return the mixte fields

def set_relations(self, field, dic_relations):
883    def set_relations(self, field, dic_relations):
884        '''Add relations in the AnaDataset from a dict.
885
886         *Parameters*
887
888        - **field** : AnaDfield, AnaField or str (idfield) - first relation AnaDfield
889        - **dic_relations** : dict - key is the second relation AnaDfield and
890        value is the dist value or teh list [dist, distrib]
891        '''
892        fld = self.dfield(field)
893        for other, dist in dic_relations.items():
894            oth = self.dfield(other)
895            self.relations[fld][oth] = AnaRelation([fld, oth], dist)
896            self.relations[oth][fld] = AnaRelation([oth, fld], dist)

Add relations in the AnaDataset from a dict.

Parameters

  • field : AnaDfield, AnaField or str (idfield) - first relation AnaDfield
  • dic_relations : dict - key is the second relation AnaDfield and value is the dist value or the list [dist, distrib]
def get_relation(self, fld1, fld2):
898    def get_relation(self, fld1, fld2):
899        '''Return AnaRelation between fld1 and fld2.
900
901         *Parameters*
902
903        - **fld1** : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield
904        - **fld2** : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield
905        '''
906        fl1 = self.dfield(fld1)
907        fl2 = self.dfield(fld2)
908        if self.root in [fl1, fl2]:
909            return AnaRelation([fl1, fl2], len(self))
910        return self.relations[self.dfield(fld1)][self.dfield(fld2)]

Return AnaRelation between fld1 and fld2.

Parameters

  • fld1 : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield
  • fld2 : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield
def dfield(self, fld):
912    def dfield(self, fld):
913        '''return the AnaDfield matching with fld. Fld is str, int, AnaDfield or AnaField'''
914        if fld in (-1, ROOT):
915            return self.root
916        if isinstance(fld, AnaDfield):
917            return fld
918        if isinstance(fld, int):
919            return self.fields[fld]
920        if isinstance(fld, str):
921            if fld in [dfld.idfield for dfld in self.fields]:
922                return [dfld for dfld in self.fields if dfld.idfield == fld][0]
923            return None
924        return AnaDfield(fld, self)

return the AnaDfield matching with fld. Fld is str, int, AnaDfield or AnaField

def tree(self, mode='derived', width=5, lname=20, string=True):
926    def tree(self, mode='derived', width=5, lname=20, string=True):
927        '''return a string with a tree of derived Field.
928
929         *Parameters*
930
931        - **lname** : integer (default 20) - length of the names
932        - **width** : integer (default 5) - length of the lines
933        - **string** : boolean (default True) - if True return str else return dict
934        - **mode** : string (default 'derived') - kind of tree :
935            'derived' : derived tree
936            'distance': min distance tree
937            'distomin': min distomin tree
938        '''
939        lis = ['root-' + mode + '*(' + str(len(self)) + ')']
940        if mode == 'distance':
941            childs = [fld for fld in self.fields if fld.p_distance == self.root]
942        elif mode == 'distomin':
943            childs = [fld for fld in self.fields if fld.p_distomin == self.root]
944        elif mode == 'derived':
945            childs = [fld for fld in self.fields if fld.p_derived == self.root]
946        for fld in childs:
947            lis.append(fld.dic_inner_node(mode, lname))
948        tree = {str(-1).ljust(2, '*'): lis}
949        if string:
950            tre = pprint.pformat(tree, indent=0, width=width)
951            tre = tre.replace('---', ' - ')
952            tre = tre.replace('  ', ' ')
953            tre = tre.replace('*', ' ')
954            for car in ["'", "\"", "{", "[", "]", "}", ","]:
955                tre = tre.replace(car, "")
956            return tre
957        return Util.clean_dic(tree, '*', ' ')

return a string with a tree of derived Field.

Parameters

  • lname : integer (default 20) - length of the names
  • width : integer (default 5) - length of the lines
  • string : boolean (default True) - if True return str else return dict
  • mode : string (default 'derived') - kind of tree: 'derived' (derived tree), 'distance' (min distance tree), 'distomin' (min distomin tree)
def to_dict(self, mode='field', keys=None, relations=False):
959    def to_dict(self, mode='field', keys=None, relations=False):
960        '''return a dict with fields attributes and optionaly relations attributes.
961
962         *Parameters*
963
964        - **mode** : str (default 'field') - AnaDfield representation
965        ('field', 'id', 'index')
966        - **relations** : boolean (default: False) - if False return a list of fields,
967        if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}'
968        - **keys** : string, list or tuple - list of keys or single key to return
969        if 'all' or None, all keys are returned
970        if list, only keys in list are returned
971        if string, only values associated to the string(key) are returned'''
972        fields = Util.filter_dic([fld.to_dict(mode=mode)
973                                 for fld in self.fields], keys)
974        leng = len(self.fields)
975        if not relations:
976            return fields
977        return {'fields': fields, 'relations':
978                [self.get_relation(i, j).to_dict(full=True, mode=mode)
979                 for i in range(-1, leng) for j in range(i + 1, leng)]}

return a dict with fields attributes and optionally relations attributes.

Parameters

  • mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
  • relations : boolean (default: False) - if False return a list of fields, if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}'
  • keys : string, list or tuple - list of keys or single key to return if 'all' or None, all keys are returned if list, only keys in list are returned if string, only values associated to the string(key) are returned
def partitions(self, mode='id', distributed=True):
 981    def partitions(self, mode='id', distributed=True):
 982        '''return a list of available partitions (the first is highest).
 983
 984         *Parameters*
 985
 986        - **mode** : str (default 'id') - AnaDfield representation
 987        ('field', 'id', 'index')
 988        - **distributed** : boolean (default True) - Include only distributed fields
 989        '''
 990        partit = [[fld] for fld in self.fields if fld.category == ROOTED]
 991        crossed = [rel for rel in self.ana_relations if rel.typecoupl == CROSSED
 992                   and rel.parent_child
 993                   and rel.relation[0].category != COUPLED
 994                   and rel.relation[1].category != COUPLED]
 995        if distributed:
 996            crossed = [rel for rel in crossed if rel.distrib]
 997        if crossed and len(crossed) == 1 and crossed[0].dist == len(self):
 998            partit.insert(0, crossed[0].relation)
 999        elif crossed:
1000            for repeat in list(range(len(crossed))):
1001                candidates = combinations(crossed, repeat + 1)
1002                for candidat in candidates:
1003                    flds = list(set(rel.relation[i]
1004                                for rel in candidat for i in [0, 1]))
1005                    if (reduce(mul, [fld.lencodec for fld in flds]) == len(self) and
1006                        len(candidat) == sum(range(len(flds))) and
1007                            (not distributed or min(rel.distrib for rel in candidat))):
1008                        partit.insert(0, flds)
1009        partit = [list(tup) for tup in
1010                  sorted(sorted(list({tuple(sorted(prt)) for prt in partit})),
1011                         key=len, reverse=True)]
1012        return Util.view(partit, mode)

return a list of available partitions (the first is highest).

Parameters

  • mode : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
  • distributed : boolean (default True) - Include only distributed fields
def field_partition(self, mode='id', partition=None, distributed=True):
1014    def field_partition(self, mode='id', partition=None, distributed=True):
1015        '''return a partition dict with the list of primary, secondary, unique
1016        and variable fields.
1017
1018        *Parameters*
1019
1020        - **mode** : str (default 'id') - AnaDfield representation
1021        ('field', 'id', 'index')
1022        - **partition** : list of str, int, AnaDfield or AnaField(default None) -
1023        if None, partition is the first
1024        - **distributed** : boolean (default True) - Include only distributed fields
1025        '''
1026        partitions = self.partitions(mode='field', distributed=distributed)
1027        if not partitions:
1028            return Util.view(
1029                {'primary': [], 'secondary': [
1030                    fld for fld in self.fields if fld.category != UNIQUE],
1031                 'mixte': [], 'unique': [
1032                    fld for fld in self.fields if fld.category == UNIQUE],
1033                 'variable': []}, mode)
1034        if not partition:
1035            partition = partitions[0]
1036        else:
1037            # partition = [self.dfield(fld) for fld in tuple(sorted(partition))]
1038            partition = [self.dfield(fld) for fld in tuple(partition)]
1039        secondary = []
1040        for field in partition:
1041            self._add_child(field, secondary)
1042        secondary = [fld for fld in secondary if not fld in partition]
1043        unique = [fld for fld in self.fields if fld.category == UNIQUE]
1044        mixte = list(self._mixte_dims(partition, partitions))
1045        variable = [fld for fld in self.fields
1046                    if not fld in partition + secondary + unique + mixte]
1047        return Util.view({'primary': partition, 'secondary': secondary,
1048                          'mixte': mixte, 'unique': unique,
1049                          'variable': variable}, mode)

return a partition dict with the list of primary, secondary, unique and variable fields.

Parameters

  • mode : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
  • partition : list of str, int, AnaDfield or AnaField(default None) - if None, partition is the first
  • distributed : boolean (default True) - Include only distributed fields
def relation_partition(self, partition=None, primary=False, noroot=False):
1051    def relation_partition(self, partition=None, primary=False, noroot=False):
1052        '''return a dict with the list of relationships for fields in a partition.
1053
1054        *Parameters*
1055
1056        - **partition** : list (default None) - if None, partition is the first
1057        - **primary** : boolean (default False) - if True, relations are primary fields
1058        - **noroot** : boolean (default False) - if True and single primary,
1059        'root' field is replaced by the primary field'''
1060        partitions = self.partitions(mode='field')
1061        if not partitions:
1062            partition = None
1063        else:
1064            partition = Util.view(partition, mode='field',
1065                                  ana=self) if partition else partitions[0]
1066        part = self.field_partition(
1067            mode='field', partition=partition, distributed=True)
1068        fields_cat = {fld: cat for cat, l_fld in part.items() for fld in l_fld}
1069        relations = {}
1070        for field in fields_cat:
1071            rel = []
1072            match fields_cat[field]:
1073                case 'primary':
1074                    rel = [field.idfield]
1075                case 'unique': ...
1076                case 'variable':
1077                    rel = [fld.idfield for fld in part['primary']]
1078                case 'secondary' if not primary:
1079                    rel = [field.p_derived.idfield]
1080                case 'secondary' if primary:
1081                    rel = [fld.idfield for fld in field.ascendants()
1082                           if fld in part['primary']]
1083                case 'mixte':
1084                    rel = [fld.idfield for fld in self._mixte_dims(
1085                        partition, partitions)[field]]
1086                case _: ...
1087            if rel == ['root'] and len(part['primary']) == 1 and noroot:
1088                rel = [part['primary'][0].idfield]
1089            if rel == ['root'] and len(part['primary']) == 0 and noroot:
1090                rel = [part['secondary'][0].idfield]
1091            relations[field.idfield] = rel
1092        return relations

return a dict with the list of relationships for fields in a partition.

Parameters

  • partition : list (default None) - if None, partition is the first
  • primary : boolean (default False) - if True, relations are primary fields
  • noroot : boolean (default False) - if True and single primary, 'root' field is replaced by the primary field
def indicator(self, fullsize, size):
1094    def indicator(self, fullsize, size):
1095        '''generate size indicators: ol (object lightness), ul (unicity level),
1096        gain (sizegain)
1097
1098        *Parameters*
1099
1100        - **fullsize** : int - size with full codec
1101        - **size** : int - size with existing codec
1102
1103        *Returns* : dict'''
1104        lenindex = len(self.fields)
1105        indexlen = sum(fld.lencodec for fld in self.fields)
1106        nval = len(self) * (lenindex + 1)
1107        sval = fullsize / nval
1108        ncod = indexlen + lenindex
1109
1110        if nval != ncod:
1111            scod = (size - ncod * sval) / (nval - ncod)
1112            olight = scod / sval
1113        else:
1114            olight = None
1115        return {'total values': nval, 'mean size': round(sval, 3),
1116                'unique values': ncod, 'mean coding size': round(scod, 3),
1117                'unicity level': round(ncod / nval, 3),
1118                'optimize level': round(size / fullsize, 3),
1119                'object lightness': round(olight, 3),
1120                'maxgain': round((nval - ncod) / nval, 3),
1121                'gain': round((fullsize - size) / fullsize, 3)}

generate size indicators: ol (object lightness), ul (unicity level), gain (sizegain)

Parameters

  • fullsize : int - size with full codec
  • size : int - size with existing codec

Returns : dict

class Util:
class Util:
    ''' common functions for analysis package'''

    @staticmethod
    def view(field_struc, mode, ana=None):
        ''' return a representation of a AnaDfields structure (field, id, index).

         *Parameters*

        - **mode** : str - AnaDfield representation ('field', 'id', 'index')
        if None, the structure is returned unchanged
        - **field_struc** : list or dict - structure to represent
        (lists and dicts are converted recursively)
        - **ana** : AnaDataset (default None) - to convert string or index in AnaDfield
        '''
        if mode is None:
            return field_struc
        # The integer index 0 is falsy but is a valid field position: it has
        # to be resolved like any other index when a dataset is available.
        resolvable_index = (isinstance(field_struc, int)
                            and not isinstance(field_struc, bool)
                            and ana is not None and mode != 'id')
        if not field_struc and not resolvable_index:
            return field_struc
        if isinstance(field_struc, dict):
            return {key: Util.view(val, mode=mode, ana=ana)
                    for key, val in field_struc.items()}
        if isinstance(field_struc, list):
            return [Util.view(val, mode=mode, ana=ana) for val in field_struc]
        if not isinstance(field_struc, AnaDfield) and mode != 'id':
            # string, index or AnaField: resolved with the dataset
            return Util.view(ana.dfield(field_struc), mode=mode)
        if mode == 'field':
            return field_struc
        if mode == 'index':
            return field_struc.index
        return field_struc.idfield

    @staticmethod
    def reduce_dic(obj, notempty=False):
        '''return a dict without None values.

         *Parameters*

        - **obj** : dict or list - structure to reduce (dicts and lists are
        reduced recursively, other values are returned unchanged)
        - **notempty** : boolean (default False) - if True, falsy values
        (empty list, empty string, 0...) are also removed from the dicts'''
        if isinstance(obj, dict):
            # notempty is forwarded so that nested dicts are filtered with
            # the same rule as the top-level dict
            return {key: Util.reduce_dic(val, notempty)
                    for key, val in obj.items()
                    if val is not None and (not notempty or val)}
        if isinstance(obj, list):
            return [Util.reduce_dic(val, notempty) for val in obj]
        return obj

    @staticmethod
    def clean_dic(obj, old, new):
        '''return a dict or list with updated strings by replacing "old" substring
        with "new" substring (keys and values of dicts, items of lists)'''
        if isinstance(obj, str):
            return obj.replace(old, new)
        if isinstance(obj, list):
            return [Util.clean_dic(item, old, new) for item in obj]
        if isinstance(obj, dict):
            return {Util.clean_dic(key, old, new): Util.clean_dic(val, old, new)
                    for key, val in obj.items()}
        return obj

    @staticmethod
    def filter_dic(obj, keys):
        '''return extract of a list of dict or of a dict

         *Parameters*

        - **obj** : dict or list of dict - data to filter (another kind of
        object is returned unchanged)
        - **keys** : string, list or tuple - list of keys or single key to return
        if 'all' or None, all keys are returned
        if list, only keys in list are returned
        if string, only values associated to the string(key) are returned'''
        if not keys or keys == 'all':
            return obj
        if isinstance(obj, list):
            return [Util.filter_dic(item, keys) for item in obj]
        if not isinstance(obj, dict):
            return obj
        if isinstance(keys, str):
            return obj.get(keys)
        if isinstance(keys, (list, tuple)):
            return {key: val for key, val in obj.items() if key in keys}
        return obj

common functions for analysis package

@staticmethod
def view(field_struc, mode, ana=None):
1148    @staticmethod
1149    def view(field_struc, mode, ana=None):
1150        ''' return a representation of a AnaDfields structure (field, id, index).
1151
1152         *Parameters*
1153
1154        - **mode** : str - AnaDfield representation ('field', 'id', 'index')
1155        - **field_struc** : list or dict - structure to represent
1156        - **ana** : AnaDataset (default None) - to convert string or index in AnaDfield
1157        '''
1158
1159        if mode is None or not field_struc:
1160            return field_struc
1161        if isinstance(field_struc, dict):
1162            return {key: Util.view(val, mode=mode, ana=ana)
1163                    for key, val in field_struc.items()}
1164        if isinstance(field_struc, list):
1165            return [Util.view(val, mode=mode, ana=ana) for val in field_struc]
1166        if not isinstance(field_struc, AnaDfield) and mode != 'id':
1167            return Util.view(ana.dfield(field_struc), mode=mode)
1168        return field_struc if mode == 'field' else (
1169            field_struc.index if mode == 'index' else field_struc.idfield)

return a representation of a AnaDfields structure (field, id, index).

Parameters

  • mode : str - AnaDfield representation ('field', 'id', 'index')
  • field_struc : list or dict - structure to represent
  • ana : AnaDataset (default None) - to convert string or index in AnaDfield
@staticmethod
def reduce_dic(obj, notempty=False):
1171    @staticmethod
1172    def reduce_dic(obj, notempty=False):
1173        '''return a dict without None values'''
1174        if isinstance(obj, dict):
1175            return {key: Util.reduce_dic(val) for key, val in obj.items()
1176                    if not val is None and (not notempty or val)}
1177        if isinstance(obj, list):
1178            return [Util.reduce_dic(val) for val in obj]
1179        return obj

return a dict without None values

@staticmethod
def clean_dic(obj, old, new):
1181    @staticmethod
1182    def clean_dic(obj, old, new):
1183        '''return a dict or list with updated strings by replacing "old" substring
1184        with "new" substring'''
1185        if isinstance(obj, dict):
1186            return {Util.clean_dic(key, old, new): Util.clean_dic(val, old, new)
1187                    for key, val in obj.items()}
1188        if isinstance(obj, str):
1189            return obj.replace(old, new)
1190        if isinstance(obj, list):
1191            return [Util.clean_dic(val, old, new) for val in obj]
1192        return obj

return a dict or list with updated strings by replacing "old" substring with "new" substring

@staticmethod
def filter_dic(obj, keys):
1194    @staticmethod
1195    def filter_dic(obj, keys):
1196        '''return extract of a list of dict or of a dict
1197
1198         *Parameters*
1199
1200        - **keys** : string, list or tuple - list of keys or single key to return
1201        if 'all' or None, all keys are returned
1202        if list, only keys in list are returned
1203        if string, only values associated to the string(key) are returned'''
1204        if not keys or keys == 'all':
1205            return obj
1206        if isinstance(obj, list):
1207            return [Util.filter_dic(dic, keys) for dic in obj]
1208        if isinstance(keys, str) and isinstance(obj, dict):
1209            return obj.get(keys, None)
1210        if isinstance(keys, (list, tuple)) and isinstance(obj, dict):
1211            return {key: val for key, val in obj.items() if key in keys}
1212        return obj

return extract of a list of dict or of a dict

Parameters

  • keys : string, list or tuple - list of keys or single key to return if 'all' or None, all keys are returned if list, only keys in list are returned if string, only values associated to the string(key) are returned
class AnaError(builtins.Exception):
class AnaError(Exception):
    ''' Analysis Exception (exception class of the analysis module)'''

Analysis Exception

Inherited Members
builtins.Exception
Exception
builtins.BaseException
with_traceback
add_note
args