tab-analysis.tab_analysis.analysis

This module analyses structure and relationships included in a tabular object (Pandas DataFrame, Dataset, list of list) :

  • Structure of a single field (class AnaField),
  • Relationship between two fields (class AnaRelation)
  • Structure and relationships of fields inside a dataset (class AnaDfield)
  • Structure of a dataset (class AnaDataset)

It also contains two other classes: Util and AnaError.

   1# -*- coding: utf-8 -*-
   2"""
   3This module analyses structure and relationships included in a tabular object
   4(Pandas DataFrame, Dataset, list of list) :
   5- Structure of a single field (class `AnaField`),
   6- Relationship between two fields (class `AnaRelation`)
   7- Structure and relationships of fields inside a dataset (class `AnaDfield`)
   8- Structure of a dataset (class `AnaDataset`)
   9
  10It also contains two other classes: `Util` and `AnaError`.
  11"""
  12import json
  13import pprint
  14from itertools import combinations
  15from operator import mul
  16from functools import reduce
  17
  18NULL = 'null'
  19UNIQUE = 'unique'
  20COMPLETE = 'complete'
  21FULL = 'full'
  22DEFAULT = 'default'
  23MIXED = 'mixed'
  24
  25COUPLED = 'coupled'
  26DERIVED = 'derived'
  27LINKED = 'linked'
  28CROSSED = 'crossed'
  29DISTRIBUTED = 'distributed'
  30ROOTED = 'rooted'
  31ROOT = 'root'
  32
  33IDFIELD = 'id'
  34MINCODEC = 'mincodec'
  35MAXCODEC = 'maxcodec'
  36LENCODEC = 'lencodec'
  37RATECODEC = 'ratecodec'
  38DMINCODEC = 'dmincodec'
  39DMAXCODEC = 'dmaxcodec'
  40RANCODEC = 'rancodec'
  41TYPECODEC = 'typecodec'
  42HASHF = 'hashf'
  43RELATION = 'relation'
  44HASHR = 'hashr'
  45DIST = 'dist'
  46DMAX = 'dmax'
  47DMIN = 'dmin'
  48DIFF = 'diff'
  49DRAN = 'dran'
  50NUM = 'num'
  51CATEGORY = 'category'
  52PDERIVED = 'pderived'
  53PDISTANCE = 'pdistance'
  54PDISTOMIN = 'pdistomin'
  55DISDISTANCE = 'disdistance'
  56DERDISTANCE = 'derdistance'
  57DISRATECPL = 'disratecpl'
  58DERRATECPL = 'derratecpl'
  59DISRATEDER = 'disrateder'
  60DERRATEDER = 'derrateder'
  61
  62TYPECOUPL = 'typecoupl'
  63PARENTCHILD = 'parentchild'
  64DISTANCE = 'distance'
  65DISTOMIN = 'distomin'
  66DISTOMAX = 'distomax'
  67DISTROOT = 'distroot'
  68RATECPL = 'ratecpl'
  69RATEDER = 'rateder'
  70
  71IDDATASET = 'name'
  72RELATIONS = 'relations'
  73FIELDS = 'fields'
  74LENGTH = 'length'
  75HASHD = 'hashd'
  76
  77
  78class AnaField:
  79    '''This class analyses field entities.
  80
  81    *Attributes*
  82
  83    - **idfield** : string - name or Id of the field
  84    - **lencodec**: integer - codec length
  85    - **mincodec**: integer - minimal codec length
  86    - **maxcodec**: integer - minimal codec length
  87    - **hashf**: integer - hash value to identify modifications
  88
  89    *characteristic (@property)*
  90
  91    - `iscomplete`
  92    - `ratecodec`
  93    - `dmincodec`
  94    - `dmaxcodec`
  95    - `rancodec`
  96    - `typecodec`
  97
  98    *instance methods*
  99
 100    - `to_dict`
 101
 102    '''
 103
 104    def __init__(self, idfield, lencodec=None, mincodec=None, maxcodec=None, hashf=None):
 105        '''Creation mode :
 106        - single dict attribute where keys are attributes name,
 107        - single AnaField attribute to make a copy
 108        - multiple attributes
 109
 110        *Parameters (multiple attributes)*
 111
 112        - **idfield** : string or integer - Id of the Field
 113        - **lencodec** : integer (default None) - length of the codec
 114        - **mincodec** : integer (default None) - number of different values
 115        - **maxcodec** : integer (default None) - length of the field
 116        - **hashf** : string (default None) - update identifier
 117        
 118        *example*
 119        
 120        AnaField is created with a dict
 121        >>> AnaField(Cfield([1,2,3,3]).to_analysis).to_dict()
 122        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
 123        >>> AnaField({'lencodec': 4, 'mincodec': 3, 'maxcodec': 4})
 124        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
 125        
 126        AnaField is created with parameters
 127        >>> AnaField(lencodec=4, mincodec=3, maxcodec=4).to_dict()
 128        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}        
 129        >>> AnaField(4, 3, 4).to_dict()
 130        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
 131        '''
 132        if isinstance(idfield, dict):
 133            self.idfield = idfield.get(IDFIELD, None)
 134            self.lencodec = idfield.get(LENCODEC, None)
 135            self.mincodec = idfield.get(MINCODEC, None)
 136            self.maxcodec = idfield.get(MAXCODEC, None)
 137            self.hashf = idfield.get(HASHF, None)
 138            return
 139        if isinstance(idfield, (AnaField, AnaDfield)):
 140            self.idfield = idfield.idfield
 141            self.lencodec = idfield.lencodec
 142            self.mincodec = idfield.mincodec
 143            self.maxcodec = idfield.maxcodec
 144            self.hashf = idfield.hashf
 145            return
 146        if not lencodec or not isinstance(lencodec, int):
 147            raise AnaError("lencodec is not correct")
 148        self.idfield = idfield
 149        self.lencodec = lencodec
 150        self.mincodec = mincodec
 151        self.maxcodec = maxcodec
 152        self.hashf = hashf
 153
 154    def __len__(self):
 155        '''length of the field (maxcodec)'''
 156        return self.maxcodec if self.maxcodec else self.lencodec
 157
 158    def __repr__(self):
 159        '''representation of the field (class name + idfield)'''
 160        return self.__class__.__name__ + '(' + str(self.idfield) + ')'
 161
 162    def __eq__(self, other):
 163        ''' equal if class and attributes are equal'''
 164        return self.__class__ .__name__ == other.__class__.__name__ and \
 165            self.idfield == other.idfield and self.lencodec == other.lencodec and \
 166            self.mincodec == other.mincodec and self.maxcodec == other.maxcodec and \
 167            self.hashf == other.hashf
 168
 169    def __lt__(self, other):
 170        ''' return a comparison between hash value'''
 171        return hash(self) < hash(other)
 172
 173    def __hash__(self):
 174        '''return hash value (sum of attributes hash)'''
 175        return hash(self.idfield) + hash(self.lencodec) + hash(self.mincodec) \
 176            + hash(self.maxcodec) + hash(self.hashf)
 177
 178    def __str__(self):
 179        '''json-text build with the attributes dict'''
 180        return json.dumps(self.to_dict(idfield=True))
 181
 182    def __copy__(self):
 183        ''' Copy all the attributes '''
 184        return self.__class__(self)
 185
 186    def to_dict(self, full=False, idfield=False, notnone=True):
 187        '''return a dict with field attributes.
 188
 189         *Parameters*
 190
 191        - **full** : boolean (default False) - if True, all the attributes are included
 192        - **idfield** : boolean (default False) - if True, idfield is included
 193        - **notnone** : boolean (default True) - if True, None values are not included
 194        '''
 195        dic = {LENCODEC: self.lencodec, MINCODEC: self.mincodec,
 196               MAXCODEC: self.maxcodec}
 197        if idfield or full:
 198            dic[IDFIELD] = self.idfield
 199        if full:
 200            dic |= {RATECODEC: self.ratecodec, DMINCODEC: self.dmincodec,
 201                    DMAXCODEC: self.dmaxcodec, RANCODEC: self.rancodec,
 202                    TYPECODEC: self.typecodec}
 203        if notnone:
 204            return Util.reduce_dic(dic)
 205        return dic
 206
 207    @property
 208    def iscomplete(self):
 209        '''return boolean indicator : True if all attributes are present'''
 210        return not self.maxcodec is None and not self.mincodec is None
 211
 212    @property
 213    def ratecodec(self):
 214        '''return float ratecodec indicator'''
 215        if self.iscomplete and self.maxcodec - self.mincodec:
 216            return (self.maxcodec - self.lencodec) / (self.maxcodec - self.mincodec)
 217        return None
 218
 219    @property
 220    def dmincodec(self):
 221        '''return integer dmincodec indicator'''
 222        return self.lencodec - self.mincodec if self.iscomplete else None
 223
 224    @property
 225    def dmaxcodec(self):
 226        '''return integer dmaxcodec indicator'''
 227        return self.maxcodec - self.lencodec if self.iscomplete else None
 228
 229    @property
 230    def rancodec(self):
 231        '''return integer rancodec indicator'''
 232        return self.maxcodec - self.mincodec if self.iscomplete else None
 233
 234    @property
 235    def typecodec(self):
 236        '''return string typecodec indicator
 237        (null, unique, complete, full, default, mixed)
 238        '''
 239        if self.maxcodec is None or self.mincodec is None:
 240            return None
 241        if self.maxcodec == 0:
 242            return NULL
 243        if self.lencodec == 1:
 244            return UNIQUE
 245        if self.mincodec == self.maxcodec:
 246            return COMPLETE
 247        if self.lencodec == self.maxcodec:
 248            return FULL
 249        if self.lencodec == self.mincodec:
 250            return DEFAULT
 251        return MIXED
 252
 253
 254class AnaRelation:
 255    '''This class analyses relationship between two fields
 256
 257    *Attributes* :
 258
 259    - **relation** : List of the two fields involved in the relationship
 260    - **dist** : value of the relationship
 261    - **distrib** : boolean True if values are distributed
 262    - **hashr**: integer - hash value to identify update
 263
 264    *global (@property)*
 265
 266    - `id_relation`
 267    - `index_relation`
 268    - `parent_child`
 269    - `typecoupl`
 270
 271    *characteristic (@property)*
 272
 273    - `dmax`
 274    - `dmin`
 275    - `diff`
 276    - `dran`
 277    - `distomin`
 278    - `distomax`
 279    - `distance`
 280    - `ratecpl`
 281    - `rateder`
 282
 283    *instance methods*
 284
 285    - `to_dict`
 286    '''
 287
 288    def __init__(self, relation, dists, hashr=None):
 289        '''Constructor of the relationship :
 290
 291         *Parameters*
 292
 293        - **relation** : List of the two fields involved in the relationship
 294        - **dists** : dist value or list of dist value and distrib boolean
 295        - **distrib** : boolean True if values are distributed
 296        - **hashr**: integer - hash value to identify update
 297        '''
 298        self.relation = relation
 299        if isinstance(dists, list):
 300            self.dist = dists[0]
 301            self.distrib = dists[1]
 302        else:
 303            self.dist = dists
 304            self.distrib = None
 305        self.hashr = hashr
 306
 307    def __repr__(self):
 308        '''representation of the field (class name + idfield)'''
 309        return self.__class__.__name__ + '(' + str(self.id_relation) + ')'
 310
 311    def __str__(self):
 312        '''json-text build with the attributes dict'''
 313        return json.dumps(self.to_dict(relation=True))
 314
 315    def __eq__(self, other):
 316        ''' equal if class and values are equal'''
 317        return self.__class__ .__name__ == other.__class__.__name__ and \
 318            self.relation == other.relation and self.dist == other.dist and \
 319            self.hashr == other.hashr and self.distrib == other.distrib
 320
 321    def __hash__(self):
 322        '''return hash value (sum of attributes hash)'''
 323        return hash(self.relation[0]) + hash(self.relation[1]) + \
 324            hash(self.dist) + hash(self.hashr) + hash(self.distrib)
 325
 326    def to_dict(self, distances=False, full=False, mode='field', relation=False,
 327                notnone=True, misc=False):
 328        '''return a dict with AnaRelation attributes.
 329
 330         *Parameters*
 331
 332        - **distances** : boolean (default False) - if True, distances indicators are included
 333        - **full** : boolean (default False) - if True, all the attributes are included
 334        - **relation** : boolean (default False) - if True, idfield are included
 335        - **notnone** : boolean (default True) - if True, None values are not included
 336        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
 337        '''
 338        dic = {DIST: self.dist, TYPECOUPL: self.typecoupl, HASHR: self.hashr}
 339        if relation or full:
 340            dic[RELATION] = Util.view(self.relation, mode)
 341            #dic[TYPECOUPL] = self.typecoupl
 342            dic[PARENTCHILD] = self.parent_child
 343        if distances or full:
 344            dic |= {DISTANCE: self.distance, DISTOMIN: self.distomin,
 345                    DISTOMAX: self.distomax, DISTRIBUTED: self.distrib,
 346                    RATECPL: self.ratecpl, RATEDER: self.rateder}
 347        if misc or full:
 348            dic |= {DMAX: self.dmax, DMIN: self.dmin,
 349                    DIFF: self.diff, DRAN: self.dran}
 350        if notnone:
 351            return Util.reduce_dic(dic)
 352        return dic
 353
 354    @property
 355    def id_relation(self):
 356        '''return a list with the id of the two fields involved'''
 357        if self.relation:
 358            return [fld.idfield for fld in self.relation]
 359        return []
 360
 361    @property
 362    def parent_child(self):
 363        '''returns the direction of the relationship (True if parent is first)'''
 364        rel0 = self.relation[0]
 365        rel1 = self.relation[1]
 366        # if isinstance(rel0, AnaDfield) and isinstance(rel1, AnaDfield):
 367        return (rel0.lencodec > rel1.lencodec or
 368                (rel0.lencodec == rel1.lencodec and rel0.index < rel1.index))
 369        # return None
 370
 371    @property
 372    def index_relation(self):
 373        '''return a list with the index of the two fields involved'''
 374        if self.relation:
 375            return [fld.index for fld in self.relation]
 376        return []
 377
 378    @property
 379    def dmax(self):
 380        '''return integer dmax indicator'''
 381        return self.relation[0].lencodec * self.relation[1].lencodec
 382
 383    @property
 384    def dmin(self):
 385        '''return integer dmin indicator'''
 386        return max(self.relation[0].lencodec, self.relation[1].lencodec)
 387
 388    @property
 389    def diff(self):
 390        '''return integer diff indicator'''
 391        return abs(self.relation[0].lencodec - self.relation[1].lencodec)
 392
 393    @property
 394    def dran(self):
 395        '''return integer dran indicator'''
 396        return self.dmax - self.dmin
 397
 398    @property
 399    def distomin(self):
 400        '''return integer distomin indicator'''
 401        return self.dist - self.dmin
 402
 403    @property
 404    def distomax(self):
 405        '''return integer distomax indicator'''
 406        return self.dmax - self.dist
 407
 408    @property
 409    def distance(self):
 410        '''return integer distance indicator'''
 411        return self.distomin + self.diff
 412
 413    @property
 414    def ratecpl(self):
 415        '''return float ratecpl indicator'''
 416        disdis = self.distance + self.distomax
 417        return 0 if disdis == 0 else self.distance / disdis
 418
 419    @property
 420    def rateder(self):
 421        '''return float rateder indicator'''
 422        return 0 if self.dran == 0 else self.distomin / self.dran
 423
 424    @property
 425    def typecoupl(self):
 426        '''return relationship type (coupled, derived, crossed, linked)'''
 427        if self.distance == 0:
 428            return COUPLED
 429        if self.distomin == 0:
 430            return DERIVED
 431        if self.distomax == 0:
 432            return CROSSED
 433        return LINKED
 434
 435
 436class AnaDfield(AnaField):
 437    '''This class analyses structure and relationships of fields inside a dataset
 438
 439    *Attributes* :
 440
 441    - **dataset** : AnaDataset object where AnaDfield is included
 442    - **AnaField attributes** : inheritance of AnaField object
 443
 444    *relationship (@property)*
 445
 446    - `list_relations`
 447    - `list_p_derived`
 448    - `list_c_derived`
 449    - `list_coupled`
 450
 451    *field (@property)*
 452
 453    - `fields`
 454    - `p_derived`
 455    - `p_distance`
 456    - `p_distomin`
 457
 458    *global (@property)*
 459
 460    - `index`
 461    - `dist_root`
 462    - `category`
 463
 464    *global (instance methods)*
 465
 466    - `ascendants`
 467    - `to_dict`
 468    - `view`
 469
 470    *other instance methods*
 471
 472    - `dic_inner_node`
 473    '''
 474    def __new__(cls, other, dataset=None):
 475        '''initialization of attributes from "other"'''
 476        if isinstance(other, AnaDfield):
 477            new = AnaDfield.__copy__(other)
 478            return new
 479        if isinstance(other, AnaField):
 480            new = AnaField.__copy__(other)
 481            new.__class__ = AnaDfield
 482            return new
 483        return object.__new__(cls)
 484
 485    def __init__(self, other, dataset):
 486        '''AnaDfield is created by adding a AnaDataset link to an AnaField object.
 487
 488         *Parameters*
 489
 490        - **other** : AnaField or AnaDfield to initialize attributes
 491        - **dataset** : AnaDataset which includes the AnaDfield
 492        '''
 493        self.dataset = dataset
 494
 495    def __copy__(self):
 496        ''' Copy all the data '''
 497        return self.__class__(AnaField(self), self.dataset)
 498
 499    def __lt__(self, other):
 500        ''' return a comparison between field index'''
 501        return self.index < other.index
 502
 503    @property
 504    def index(self):
 505        '''return the row of the field in the AnaDataset'''
 506        if self == self.dataset.root:
 507            return -1
 508        return self.dataset.fields.index(self)
 509
 510    @property
 511    def fields(self):
 512        '''return the list of the fields included in the AnaDataset'''
 513        return self.dataset.fields
 514
 515    @property
 516    def list_relations(self):
 517        '''return the list of the relations with the AnaDfield'''
 518        return list(self.dataset.relations[self].values())
 519
 520    @property
 521    def list_p_derived(self):
 522        '''return the list of the derived relations with the parents of AnaDfield'''
 523        return [rel for rel in self.list_relations if rel.typecoupl == DERIVED
 524                and not rel.parent_child]
 525
 526    @property
 527    def list_c_derived(self):
 528        '''return the list of the derived relations with the childs of AnaDfield'''
 529        return [rel for rel in self.list_relations if rel.typecoupl == DERIVED
 530                and rel.parent_child
 531                and rel.relation[1].category != UNIQUE]
 532
 533    @property
 534    def list_coupled(self):
 535        '''return the list of the coupled relations with the AnaDfield'''
 536        return [rel for rel in self.list_relations if rel.typecoupl == COUPLED]
 537
 538    @property
 539    def dist_root(self):
 540        '''return the distance to the root field'''
 541        return len(self.dataset) - self.lencodec
 542
 543    @property
 544    def category(self):
 545        '''return AnaDfield category (unique, rooted, coupled, derived, mixed)'''
 546        if self.typecodec == UNIQUE:
 547            return UNIQUE
 548        if self.typecodec in (COMPLETE, FULL):
 549            return ROOTED
 550        if COUPLED in [rel.typecoupl for rel in self.list_relations
 551                       if not rel.parent_child]:
 552            return COUPLED
 553        if not self.list_c_derived:
 554            return DERIVED
 555        return MIXED
 556
 557    @property
 558    def p_derived(self):
 559        '''return the first derived or coupled parent of the AnaDfield'''
 560        if self.category in (UNIQUE, ROOTED):
 561            return self.dataset.root
 562        if self.category == COUPLED:
 563            return [rel.relation[1] for rel in self.list_coupled
 564                    if not rel.relation[1].category == COUPLED][0]
 565        if not self.list_p_derived:
 566            return self.dataset.root
 567        distance_min = min(rel.distance for rel in self.list_p_derived)
 568        for rel in self.list_p_derived:
 569            if rel.distance == distance_min:
 570                if rel.relation[1].category == ROOTED:
 571                    return self.dataset.root
 572                if rel.relation[1].category == MIXED:
 573                    return rel.relation[1]
 574        return self.dataset.root
 575
 576    @property
 577    def p_distance(self):
 578        '''return the first parent with minimal distance of the AnaDfield'''
 579        return self._p_min_dist()
 580
 581    @property
 582    def p_distomin(self):
 583        '''return the first parent with minimal distomin of the AnaDfield'''
 584        return self._p_min_dist(False)
 585
 586    def _p_min_dist(self, distance=True):
 587        '''return the parent with minimal distance of the AnaDfield'''
 588        if self.category == UNIQUE:
 589            return self.dataset.root
 590        if distance:
 591            dist_up = [rel.distance for rel in self.list_relations if
 592                       not rel.parent_child]
 593            # not rel.parent_child and rel.relation[1].category != COUPLED]
 594        else:
 595            dist_up = [rel.distomin for rel in self.list_relations if
 596                       not rel.parent_child]
 597            # not rel.parent_child and rel.relation[1].category != COUPLED]
 598        if not dist_up or min(dist_up) == self.dist_root:
 599            return self.dataset.root
 600        dist_min = min(dist_up)
 601        if distance:
 602            list_dmin = [rel.relation[1] for rel in self.list_relations
 603                         if rel.distance == dist_min]
 604            # if rel.distance == dist_min and not rel.parent_child]
 605        else:
 606            list_dmin = [rel.relation[1] for rel in self.list_relations
 607                         if rel.distomin == dist_min]
 608            # if rel.distomin == dist_min and not rel.parent_child]
 609        max_lencodec = max(fld.lencodec for fld in list_dmin)
 610        return [fld for fld in list_dmin if fld.lencodec == max_lencodec][0]
 611
 612    def to_dict(self, mode='id'):
 613        '''return a dict with field attributes.
 614
 615         *Parameters*
 616
 617        - **mode** : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
 618        '''
 619        dic = super().to_dict(full=True, notnone=False)
 620        dic[DISTROOT] = self.dist_root
 621        dic[NUM] = self.index
 622        dic[CATEGORY] = self.category
 623        dic[PDISTANCE] = self.p_distance.view(mode)
 624        dic[PDISTOMIN] = self.p_distomin.view(mode)
 625        dic[PDERIVED] = self.p_derived.view(mode)
 626        return dic
 627
 628    def view(self, mode='field'):
 629        ''' return a representation of the AnaDfield
 630
 631         *Parameters*
 632
 633        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
 634        '''
 635        return Util.view(self, mode)
 636
 637    def ascendants(self, typeparent='derived', mode='field'):
 638        ''' return the list of the AnaDfield's ascendants in the family tree up to
 639        the root AnaDfield.
 640
 641         *Parameters*
 642
 643        - **typeparent** : str (default 'derived') - 'derived', 'distance' or 'distomin'
 644        - **mode** : str (default 'field') - AnaDfield representation
 645        ('field', 'id', 'index')
 646
 647        *Returns* : list of parents from closest to the most distant. Parents
 648        are represented with index, idfield, or object
 649        '''
 650        parent = self
 651        listparent = []
 652        while parent != self.dataset.root:
 653            if typeparent == 'derived':
 654                parent = parent.p_derived
 655            elif typeparent == 'distance':
 656                parent = parent.p_distance
 657            else:
 658                parent = parent.p_distomin
 659            if parent != self.dataset.root:
 660                listparent.append(parent)
 661        return Util.view(listparent, mode)
 662
 663    def dic_inner_node(self, mode, lname):
 664        '''return a child AnaDfield tree.
 665
 666         *Parameters*
 667
 668        - **lname** : integer - maximal length of the names
 669        - **mode** : string (default 'derived') - kind of tree :
 670            'derived' : derived tree
 671            'distance': min distance tree
 672            'distomin': min distomin tree
 673
 674        *Returns* : dict where key is a AnaDfield and value is the list of
 675        the childs.
 676        '''
 677        adding = ''
 678        if mode == 'distance':
 679            rel_parent = self.dataset.get_relation(self, self.p_distance)
 680            adding = str(rel_parent.distance) + ' - '
 681        elif mode == 'distomin':
 682            rel_parent = self.dataset.get_relation(self, self.p_distomin)
 683            adding = str(rel_parent.distomin) + ' - '
 684        elif mode == 'derived':
 685            rel_parent = self.dataset.get_relation(self, self.p_derived)
 686            adding = str(rel_parent.distance) + ' - '
 687        adding += str(self.lencodec)
 688        name = str(self.idfield)[:lname] + ' (' + adding + ')'
 689        lis = [name.replace(' ', '*').replace("'", '*')]
 690        if mode == 'derived':
 691            childs = []
 692            #if not self.category in (ROOTED, COUPLED):
 693            if not self.category in (ROOTED, COUPLED, UNIQUE):
 694                for rel in self.list_coupled:
 695                    lis.append(rel.relation[1].dic_inner_node(mode, lname))
 696            if not self.category in (ROOTED, UNIQUE):
 697                childs = [rel.relation[1] for rel in self.list_relations
 698                          if rel.relation[1].p_derived == self and
 699                          rel.relation[1].category != COUPLED]
 700        if mode == 'distomin':
 701            childs = [rel.relation[1] for rel in self.list_relations
 702                      if rel.relation[1].p_distomin == self]
 703        if mode == 'distance':
 704            childs = [rel.relation[1] for rel in self.list_relations
 705                      if rel.relation[1].p_distance == self]
 706        for fld in childs:
 707            lis.append(fld.dic_inner_node(mode, lname))
 708        return {str(self.index).ljust(2, '*'): lis}
 709
 710
 711class AnaDataset:
 712    '''This class analyses the structure of a dataset.
 713
 714    *Attributes* :
 715
 716    - **iddataset** : string or integer - Id of the Dataset
 717    - **fields** : list of the AnaDfields included
 718    - **relations** : dict of the AnaRelations between two AnaDfields
 719    - **hashd** : string - update identifier
 720
 721    *relationship (@property)*
 722
 723    - `ana_relations`
 724    - `p_relations`
 725
 726    *field (@property)*
 727
 728    - `root`
 729    - `primary`
 730    - `secondary`
 731    - `unique`
 732    - `variable`
 733
 734    *global (@property)*
 735
 736    - `category`
 737    - `complete`
 738    - `dimension`
 739
 740    *update (instance methods)*
 741
 742    - `set_relations`
 743
 744
 745    *access (instance methods)*
 746
 747    - `get_relation`
 748    - `dfield`
 749
 750    *synthesis (instance methods)*
 751
 752    - `tree`
 753    - `to_dict`
 754    - `indicator`
 755    - `partitions`
 756    - `field_partition`
 757    '''
 758
 759    def __init__(self, fields=None, relations=None, iddataset=None,
 760                 leng=None, hashd=None):
 761        '''Creation mode :
 762        - single dict attribute where keys are attributes name,
 763        - single AnaDataset attribute to make a copy
 764        - multiple attributes
 765
 766         *Parameters (multiple attributes)*
 767
 768        - **idfield** : string or integer - Id of the Field
 769        - **lencodec** : integer (default None) - length of the codec
 770        - **mincodec** : integer (default None) - number of different values
 771        - **maxcodec** : integer (default None) - length of the field
 772        - **hashf** : string (default None) - update identifier
 773        '''
 774        if isinstance(fields, AnaDataset):
 775            self.iddataset = fields.iddataset
 776            self.fields = fields.fields
 777            self.relations = fields.relations
 778            self.hashd = fields.hashd
 779            return
 780        if isinstance(fields, dict):
 781            iddataset = fields.get(IDDATASET, None)
 782            leng = fields.get(LENGTH, None)
 783            relations = fields.get(RELATIONS, None)
 784            hashd = fields.get(HASHD)
 785            fields = fields.get(FIELDS, None)
 786        self.iddataset = iddataset
 787        self.fields = [AnaDfield(AnaField(field), self)
 788                       for field in fields] if fields else []
 789        if leng:
 790            for fld in self.fields:
 791                fld.maxcodec = leng
 792        self.relations = {field: {} for field in self.fields}
 793        if relations:
 794            for fld, dic_relation in relations.items():
 795                self.set_relations(fld, dic_relation)
 796        self.hashd = hashd
 797
 798    def __len__(self):
 799        '''length of the AnaDataset (len of the AnaDfields included)'''
 800        return max(len(fld) for fld in self.fields)
 801
 802    def __eq__(self, other):
 803        ''' equal if class and values are equal'''
 804        return self.__class__ .__name__ == other.__class__.__name__ and \
 805            self.fields == other.fields and self.relations == other.relations and \
 806            self.iddataset == other.iddataset and self.hashd == other.hashd
 807
 808    def __hash__(self):
 809        '''return hash value (sum of attributes hash)'''
 810        return hash(self.iddataset) + sum(hash(fld) for fld in self.fields) + \
 811            sum(hash(rel) for rel in self.relations) + hash(self.hashd)
 812
 813    @property
 814    def category(self):
 815        '''return a list of AnaDfield category (unique, rooted, coupled, derived, mixed)'''
 816        return [fld.category for fld in self.fields]
 817
 818    @property
 819    def ana_relations(self):
 820        '''return the list of AnaRelation included'''
 821        return [rel for fldrel in self.relations.values() for rel in fldrel.values()]
 822
 823    @property
 824    def p_relations(self):
 825        '''return the list of oriented AnaRelation (parent first, child second)'''
 826        return [rel for rel in self.ana_relations if rel.parent_child]
 827
 828    @property
 829    def root(self):
 830        '''return the root AnaDfield'''
 831        len_self = len(self)
 832        return AnaDfield(AnaField(ROOT, len_self, len_self, len_self), self)
 833
 834    @property
 835    def primary(self):
 836        '''return the first partition of the partitions'''
 837        part = self.partitions(distributed=True)
 838        return part[0] if part else []
 839
 840    @property
 841    def complete(self):
 842        '''return True if the dimension is not 0'''
 843        return self.dimension > 0
 844
 845    @property
 846    def dimension(self):
 847        '''return the highest partition lenght'''
 848        return len(self.primary)
 849
 850    @property
 851    def secondary(self):
 852        '''return the derived ou coupled fields from primary'''
 853        secondary = []
 854        for field in self.primary:
 855            self._add_child(field, secondary)
 856        return [fld for fld in secondary if not fld in self.primary]
 857
 858    @property
 859    def unique(self):
 860        '''return the unique fields'''
 861        return [fld for fld in self.fields if fld.category == UNIQUE]
 862
 863    @property
 864    def variable(self):
 865        '''return the variable fields'''
 866        return [fld for fld in self.fields
 867                if not fld in self.primary + self.secondary + self.unique]
 868
 869    def set_relations(self, field, dic_relations):
 870        '''Add relations in the AnaDataset from a dict.
 871
 872         *Parameters*
 873
 874        - **field** : AnaDfield, AnaField or str (idfield) - first relation AnaDfield
 875        - **dic_relations** : dict - key is the second relation AnaDfield and
 876        value is the dist value or teh list [dist, distrib]
 877        '''
 878        fld = self.dfield(field)
 879        for other, dist in dic_relations.items():
 880            oth = self.dfield(other)
 881            self.relations[fld][oth] = AnaRelation([fld, oth], dist)
 882            self.relations[oth][fld] = AnaRelation([oth, fld], dist)
 883
 884    def get_relation(self, fld1, fld2):
 885        '''Return AnaRelation between fld1 and fld2.
 886
 887         *Parameters*
 888
 889        - **fld1** : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield
 890        - **fld2** : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield
 891        '''
 892        fl1 = self.dfield(fld1)
 893        fl2 = self.dfield(fld2)
 894        if self.root in [fl1, fl2]:
 895            return AnaRelation([fl1, fl2], len(self))
 896        return self.relations[self.dfield(fld1)][self.dfield(fld2)]
 897
 898    def dfield(self, fld):
 899        '''return the AnaDfield matching with fld. Fld is str, int, AnaDfield or AnaField'''
 900        if fld in (-1, ROOT):
 901            return self.root
 902        if isinstance(fld, AnaDfield):
 903            return fld
 904        if isinstance(fld, int):
 905            return self.fields[fld]
 906        if isinstance(fld, str):
 907            if fld in [dfld.idfield for dfld in self.fields]:
 908                return [dfld for dfld in self.fields if dfld.idfield == fld][0]
 909            # return self.root
 910            return None
 911        return AnaDfield(fld, self)
 912
 913    def tree(self, mode='derived', width=5, lname=20, string=True):
 914        '''return a string with a tree of derived Field.
 915
 916         *Parameters*
 917
 918        - **lname** : integer (default 20) - length of the names
 919        - **width** : integer (default 5) - length of the lines
 920        - **string** : boolean (default True) - if True return str else return dict
 921        - **mode** : string (default 'derived') - kind of tree :
 922            'derived' : derived tree
 923            'distance': min distance tree
 924            'distomin': min distomin tree
 925        '''
 926        lis = ['root-' + mode + '*(' + str(len(self)) + ')']
 927        if mode == 'distance':
 928            childs = [fld for fld in self.fields if fld.p_distance == self.root]
 929        elif mode == 'distomin':
 930            childs = [fld for fld in self.fields if fld.p_distomin == self.root]
 931        elif mode == 'derived':
 932            childs = [fld for fld in self.fields if fld.p_derived == self.root]
 933        for fld in childs:
 934            lis.append(fld.dic_inner_node(mode, lname))
 935        tree = {str(-1).ljust(2, '*'): lis}
 936        if string:
 937            tre = pprint.pformat(tree, indent=0, width=width)
 938            tre = tre.replace('---', ' - ')
 939            tre = tre.replace('  ', ' ')
 940            tre = tre.replace('*', ' ')
 941            for car in ["'", "\"", "{", "[", "]", "}", ","]:
 942                tre = tre.replace(car, "")
 943            return tre
 944        return Util.clean_dic(tree, '*', ' ')
 945
 946    def to_dict(self, mode='field', keys=None, relations=False):
 947        '''return a dict with fields attributes and optionaly relations attributes.
 948
 949         *Parameters*
 950
 951        - **mode** : str (default 'field') - AnaDfield representation
 952        ('field', 'id', 'index')
 953        - **relations** : boolean (default: False) - if False return a list of fields,
 954        if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}'
 955        - **keys** : string, list or tuple - list of keys or single key to return
 956        if 'all' or None, all keys are returned
 957        if list, only keys in list are returned
 958        if string, only values associated to the string(key) are returned'''
 959        fields = Util.filter_dic([fld.to_dict(mode=mode)
 960                                 for fld in self.fields], keys)
 961        leng = len(self.fields)
 962        if not relations:
 963            return fields
 964        return {'fields': fields, 'relations':
 965                [self.get_relation(i, j).to_dict(full=True, mode=mode)
 966                 for i in range(-1, leng) for j in range(i + 1, leng)]}
 967
    def partitions(self, mode='field', distributed=True):
        '''return a list of available partitions (the first is highest).

        A partition is either a single field whose category is ROOTED or a
        set of fields linked by crossed relations such that the product of
        their lencodec equals the length of the dataset.

         *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')
        - **distributed** : boolean (default True) - Include only distributed fields

        *Returns* : list of partitions (each partition is a sorted list),
        without duplicates, the longest partitions first.
        '''
        # single-field partitions
        partit = [[fld] for fld in self.fields if fld.category == ROOTED]
        # crossed relations, kept once (parent first) and without coupled fields
        crossed = [rel for rel in self.ana_relations if rel.typecoupl == CROSSED
                   # and rel.relation[1].index > rel.relation[0].index
                   and rel.parent_child
                   and rel.relation[0].category != COUPLED
                   and rel.relation[1].category != COUPLED]
        if distributed:
            crossed = [rel for rel in crossed if rel.distrib]
        if crossed and len(crossed) == 1 and crossed[0].dist == len(self):
            # a single crossed relation covering the whole dataset
            partit.insert(0, crossed[0].relation)
        elif crossed:
            # try every combination of 1, 2, ... crossed relations
            for repeat in list(range(len(crossed))):
                candidates = combinations(crossed, repeat + 1)
                for candidat in candidates:
                    # distinct fields involved in the combination
                    flds = list(set(rel.relation[i]
                                for rel in candidat for i in [0, 1]))
                    # kept if the product of the lencodec is the dataset length
                    # and the combination holds one relation per pair of fields
                    # (sum(range(n)) == n * (n - 1) / 2)
                    if (reduce(mul, [fld.lencodec for fld in flds]) == len(self) and
                        len(candidat) == sum(range(len(flds))) and
                            (not distributed or min(rel.distrib for rel in candidat))):
                        partit.insert(0, flds)
        partit = Util.view(partit, mode)
        # remove duplicates, sort, then put the longest partitions first
        return [list(tup) for tup in
                sorted(sorted(list({tuple(sorted(prt)) for prt in partit})),
                       key=len, reverse=True)]
1001
1002    def field_partition(self, mode='field', partition=None, distributed=True):
1003        '''return a partition dict with the list of primary, secondary, unique
1004        and variable fields.
1005
1006         *Parameters*
1007
1008        - **mode** : str (default 'field') - AnaDfield representation
1009        ('field', 'id', 'index')
1010        - **partition** : list (default None) - if None, partition is the first
1011        - **distributed** : boolean (default True) - Include only distributed fields
1012        '''
1013        if not partition:
1014            partitions = self.partitions(distributed=distributed)
1015            if not partitions:
1016                return {'primary': [], 'secondary': [], 'unique': [], 'variable': []}
1017            partition = partitions[0]
1018        else:
1019            partition = [self.dfield(fld) for fld in partition]
1020        secondary = []
1021        for field in partition:
1022            self._add_child(field, secondary)
1023        secondary = [fld for fld in secondary if not fld in partition]
1024        unique = [fld for fld in self.fields if fld.category == UNIQUE]
1025        variable = [fld for fld in self.fields
1026                    if not fld in partition + secondary + unique]
1027        return Util.view({'primary': partition, 'secondary': secondary,
1028                          'unique': unique, 'variable': variable}, mode)
1029
1030    def indicator(self, fullsize, size):
1031        '''generate size indicators: ol (object lightness), ul (unicity level),
1032        gain (sizegain)
1033
1034        *Parameters*
1035
1036        - **fullsize** : int - size with full codec
1037        - **size** : int - size with existing codec
1038
1039        *Returns* : dict'''
1040        lenindex = len(self.fields)
1041        indexlen = sum(fld.lencodec for fld in self.fields)
1042        nval = len(self) * (lenindex + 1)
1043        sval = fullsize / nval
1044        ncod = indexlen + lenindex
1045
1046        if nval != ncod:
1047            scod = (size - ncod * sval) / (nval - ncod)
1048            olight = scod / sval
1049        else:
1050            olight = None
1051        return {'total values': nval, 'mean size': round(sval, 3),
1052                'unique values': ncod, 'mean coding size': round(scod, 3),
1053                'unicity level': round(ncod / nval, 3),
1054                'optimize level': round(size / fullsize, 3),
1055                'object lightness': round(olight, 3),
1056                'maxgain': round((nval - ncod) / nval, 3),
1057                'gain': round((fullsize - size) / fullsize, 3)}
1058
1059    def _add_child(self, field, childs):
1060        ''' add derived or coupled fields in the childs list'''
1061        for rel in field.list_c_derived + field.list_coupled:
1062            child = rel.relation[1]
1063            if not child in childs and not child.category == UNIQUE:
1064                childs.append(child)
1065                if not child.category in (COUPLED, UNIQUE):
1066                    self._add_child(child, childs)
1067
1068
class Util:
    ''' common functions for analysis package'''

    @staticmethod
    def view(field_struc, mode):
        ''' return a representation of a AnaDfields structure (fields, id, index).

        The structure is returned unchanged if it is empty or if mode is
        None or 'field'. Otherwise each field is replaced by its idfield
        (mode 'id') or by its index (any other mode).

         *Parameters*

        - **mode** : str - AnaDfield representation ('field', 'id', 'index')
        - **field_struc** : list or dict - structure to represent
        '''
        if mode is None or mode == 'field' or not field_struc:
            return field_struc

        def show(fld):
            return fld.idfield if mode == 'id' else fld.index

        if isinstance(field_struc, dict):
            return {key: [show(fld) for fld in val]
                    for key, val in field_struc.items()}
        if isinstance(field_struc, list):
            # the first item tells if it is a list of lists or a list of fields
            if isinstance(field_struc[0], list):
                return [[show(fld) for fld in val] for val in field_struc]
            return [show(fld) for fld in field_struc]
        if isinstance(field_struc, AnaField):
            return show(field_struc)
        return field_struc

    @staticmethod
    def reduce_dic(obj):
        '''return a dict (or a list of dict) without None values.
        Nested dicts and lists are processed, other objects are unchanged.'''
        if isinstance(obj, list):
            return [Util.reduce_dic(item) for item in obj]
        if not isinstance(obj, dict):
            return obj
        reduced = {}
        for key, val in obj.items():
            if val is not None:
                reduced[key] = Util.reduce_dic(val)
        return reduced

    @staticmethod
    def clean_dic(obj, old, new):
        '''return a dict or list with updated strings by replacing "old" substring
        with "new" substring (keys and values, at every level)'''
        if isinstance(obj, str):
            return obj.replace(old, new)
        if isinstance(obj, list):
            return [Util.clean_dic(item, old, new) for item in obj]
        if isinstance(obj, dict):
            cleaned = {}
            for key, val in obj.items():
                cleaned[Util.clean_dic(key, old, new)] = Util.clean_dic(val, old, new)
            return cleaned
        return obj

    @staticmethod
    def filter_dic(obj, keys):
        '''return extract of a list of dict or of a dict

         *Parameters*

        - **keys** : string, list or tuple - list of keys or single key to return
        if 'all' or None, all keys are returned
        if list, only keys in list are returned
        if string, only values associated to the string(key) are returned'''
        if not keys or keys == 'all':
            return obj
        if isinstance(obj, list):
            return [Util.filter_dic(dic, keys) for dic in obj]
        if not isinstance(obj, dict):
            return obj
        if isinstance(keys, str):
            return obj.get(keys, None)
        if isinstance(keys, (list, tuple)):
            return {key: val for key, val in obj.items() if key in keys}
        return obj
1137
1138
class AnaError(Exception):
    '''Exception raised by the analysis classes of this module.'''
class AnaField:
 79class AnaField:
 80    '''This class analyses field entities.
 81
 82    *Attributes*
 83
 84    - **idfield** : string - name or Id of the field
 85    - **lencodec**: integer - codec length
 86    - **mincodec**: integer - minimal codec length
 87    - **maxcodec**: integer - minimal codec length
 88    - **hashf**: integer - hash value to identify modifications
 89
 90    *characteristic (@property)*
 91
 92    - `iscomplete`
 93    - `ratecodec`
 94    - `dmincodec`
 95    - `dmaxcodec`
 96    - `rancodec`
 97    - `typecodec`
 98
 99    *instance methods*
100
101    - `to_dict`
102
103    '''
104
105    def __init__(self, idfield, lencodec=None, mincodec=None, maxcodec=None, hashf=None):
106        '''Creation mode :
107        - single dict attribute where keys are attributes name,
108        - single AnaField attribute to make a copy
109        - multiple attributes
110
111        *Parameters (multiple attributes)*
112
113        - **idfield** : string or integer - Id of the Field
114        - **lencodec** : integer (default None) - length of the codec
115        - **mincodec** : integer (default None) - number of different values
116        - **maxcodec** : integer (default None) - length of the field
117        - **hashf** : string (default None) - update identifier
118        
119        *example*
120        
121        AnaField is created with a dict
122        >>> AnaField(Cfield([1,2,3,3]).to_analysis).to_dict()
123        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
124        >>> AnaField({'lencodec': 4, 'mincodec': 3, 'maxcodec': 4})
125        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
126        
127        AnaField is created with parameters
128        >>> AnaField(lencodec=4, mincodec=3, maxcodec=4).to_dict()
129        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}        
130        >>> AnaField(4, 3, 4).to_dict()
131        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
132        '''
133        if isinstance(idfield, dict):
134            self.idfield = idfield.get(IDFIELD, None)
135            self.lencodec = idfield.get(LENCODEC, None)
136            self.mincodec = idfield.get(MINCODEC, None)
137            self.maxcodec = idfield.get(MAXCODEC, None)
138            self.hashf = idfield.get(HASHF, None)
139            return
140        if isinstance(idfield, (AnaField, AnaDfield)):
141            self.idfield = idfield.idfield
142            self.lencodec = idfield.lencodec
143            self.mincodec = idfield.mincodec
144            self.maxcodec = idfield.maxcodec
145            self.hashf = idfield.hashf
146            return
147        if not lencodec or not isinstance(lencodec, int):
148            raise AnaError("lencodec is not correct")
149        self.idfield = idfield
150        self.lencodec = lencodec
151        self.mincodec = mincodec
152        self.maxcodec = maxcodec
153        self.hashf = hashf
154
155    def __len__(self):
156        '''length of the field (maxcodec)'''
157        return self.maxcodec if self.maxcodec else self.lencodec
158
159    def __repr__(self):
160        '''representation of the field (class name + idfield)'''
161        return self.__class__.__name__ + '(' + str(self.idfield) + ')'
162
163    def __eq__(self, other):
164        ''' equal if class and attributes are equal'''
165        return self.__class__ .__name__ == other.__class__.__name__ and \
166            self.idfield == other.idfield and self.lencodec == other.lencodec and \
167            self.mincodec == other.mincodec and self.maxcodec == other.maxcodec and \
168            self.hashf == other.hashf
169
170    def __lt__(self, other):
171        ''' return a comparison between hash value'''
172        return hash(self) < hash(other)
173
174    def __hash__(self):
175        '''return hash value (sum of attributes hash)'''
176        return hash(self.idfield) + hash(self.lencodec) + hash(self.mincodec) \
177            + hash(self.maxcodec) + hash(self.hashf)
178
179    def __str__(self):
180        '''json-text build with the attributes dict'''
181        return json.dumps(self.to_dict(idfield=True))
182
183    def __copy__(self):
184        ''' Copy all the attributes '''
185        return self.__class__(self)
186
187    def to_dict(self, full=False, idfield=False, notnone=True):
188        '''return a dict with field attributes.
189
190         *Parameters*
191
192        - **full** : boolean (default False) - if True, all the attributes are included
193        - **idfield** : boolean (default False) - if True, idfield is included
194        - **notnone** : boolean (default True) - if True, None values are not included
195        '''
196        dic = {LENCODEC: self.lencodec, MINCODEC: self.mincodec,
197               MAXCODEC: self.maxcodec}
198        if idfield or full:
199            dic[IDFIELD] = self.idfield
200        if full:
201            dic |= {RATECODEC: self.ratecodec, DMINCODEC: self.dmincodec,
202                    DMAXCODEC: self.dmaxcodec, RANCODEC: self.rancodec,
203                    TYPECODEC: self.typecodec}
204        if notnone:
205            return Util.reduce_dic(dic)
206        return dic
207
208    @property
209    def iscomplete(self):
210        '''return boolean indicator : True if all attributes are present'''
211        return not self.maxcodec is None and not self.mincodec is None
212
213    @property
214    def ratecodec(self):
215        '''return float ratecodec indicator'''
216        if self.iscomplete and self.maxcodec - self.mincodec:
217            return (self.maxcodec - self.lencodec) / (self.maxcodec - self.mincodec)
218        return None
219
220    @property
221    def dmincodec(self):
222        '''return integer dmincodec indicator'''
223        return self.lencodec - self.mincodec if self.iscomplete else None
224
225    @property
226    def dmaxcodec(self):
227        '''return integer dmaxcodec indicator'''
228        return self.maxcodec - self.lencodec if self.iscomplete else None
229
230    @property
231    def rancodec(self):
232        '''return integer rancodec indicator'''
233        return self.maxcodec - self.mincodec if self.iscomplete else None
234
235    @property
236    def typecodec(self):
237        '''return string typecodec indicator
238        (null, unique, complete, full, default, mixed)
239        '''
240        if self.maxcodec is None or self.mincodec is None:
241            return None
242        if self.maxcodec == 0:
243            return NULL
244        if self.lencodec == 1:
245            return UNIQUE
246        if self.mincodec == self.maxcodec:
247            return COMPLETE
248        if self.lencodec == self.maxcodec:
249            return FULL
250        if self.lencodec == self.mincodec:
251            return DEFAULT
252        return MIXED

This class analyses field entities.

Attributes

  • idfield : string - name or Id of the field
  • lencodec: integer - codec length
  • mincodec: integer - minimal codec length
  • maxcodec: integer - maximal codec length
  • hashf: integer - hash value to identify modifications

characteristic (@property)

instance methods

AnaField(idfield, lencodec=None, mincodec=None, maxcodec=None, hashf=None)
105    def __init__(self, idfield, lencodec=None, mincodec=None, maxcodec=None, hashf=None):
106        '''Creation mode :
107        - single dict attribute where keys are attributes name,
108        - single AnaField attribute to make a copy
109        - multiple attributes
110
111        *Parameters (multiple attributes)*
112
113        - **idfield** : string or integer - Id of the Field
114        - **lencodec** : integer (default None) - length of the codec
115        - **mincodec** : integer (default None) - number of different values
116        - **maxcodec** : integer (default None) - length of the field
117        - **hashf** : string (default None) - update identifier
118        
119        *example*
120        
121        AnaField is created with a dict
122        >>> AnaField(Cfield([1,2,3,3]).to_analysis).to_dict()
123        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
124        >>> AnaField({'lencodec': 4, 'mincodec': 3, 'maxcodec': 4})
125        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
126        
127        AnaField is created with parameters
128        >>> AnaField(lencodec=4, mincodec=3, maxcodec=4).to_dict()
129        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}        
130        >>> AnaField(4, 3, 4).to_dict()
131        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
132        '''
133        if isinstance(idfield, dict):
134            self.idfield = idfield.get(IDFIELD, None)
135            self.lencodec = idfield.get(LENCODEC, None)
136            self.mincodec = idfield.get(MINCODEC, None)
137            self.maxcodec = idfield.get(MAXCODEC, None)
138            self.hashf = idfield.get(HASHF, None)
139            return
140        if isinstance(idfield, (AnaField, AnaDfield)):
141            self.idfield = idfield.idfield
142            self.lencodec = idfield.lencodec
143            self.mincodec = idfield.mincodec
144            self.maxcodec = idfield.maxcodec
145            self.hashf = idfield.hashf
146            return
147        if not lencodec or not isinstance(lencodec, int):
148            raise AnaError("lencodec is not correct")
149        self.idfield = idfield
150        self.lencodec = lencodec
151        self.mincodec = mincodec
152        self.maxcodec = maxcodec
153        self.hashf = hashf

Creation mode :

  • single dict attribute where keys are attributes name,
  • single AnaField attribute to make a copy
  • multiple attributes

Parameters (multiple attributes)

  • idfield : string or integer - Id of the Field
  • lencodec : integer (default None) - length of the codec
  • mincodec : integer (default None) - number of different values
  • maxcodec : integer (default None) - length of the field
  • hashf : string (default None) - update identifier

example

AnaField is created with a dict

>>> AnaField(Cfield([1,2,3,3]).to_analysis).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
>>> AnaField({'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}

AnaField is created with parameters

>>> AnaField('fld', lencodec=4, mincodec=3, maxcodec=4).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
>>> AnaField('fld', 4, 3, 4).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
def to_dict(self, full=False, idfield=False, notnone=True):
187    def to_dict(self, full=False, idfield=False, notnone=True):
188        '''return a dict with field attributes.
189
190         *Parameters*
191
192        - **full** : boolean (default False) - if True, all the attributes are included
193        - **idfield** : boolean (default False) - if True, idfield is included
194        - **notnone** : boolean (default True) - if True, None values are not included
195        '''
196        dic = {LENCODEC: self.lencodec, MINCODEC: self.mincodec,
197               MAXCODEC: self.maxcodec}
198        if idfield or full:
199            dic[IDFIELD] = self.idfield
200        if full:
201            dic |= {RATECODEC: self.ratecodec, DMINCODEC: self.dmincodec,
202                    DMAXCODEC: self.dmaxcodec, RANCODEC: self.rancodec,
203                    TYPECODEC: self.typecodec}
204        if notnone:
205            return Util.reduce_dic(dic)
206        return dic

return a dict with field attributes.

Parameters

  • full : boolean (default False) - if True, all the attributes are included
  • idfield : boolean (default False) - if True, idfield is included
  • notnone : boolean (default True) - if True, None values are not included
iscomplete

return boolean indicator : True if all attributes are present

ratecodec

return float ratecodec indicator

dmincodec

return integer dmincodec indicator

dmaxcodec

return integer dmaxcodec indicator

rancodec

return integer rancodec indicator

typecodec

return string typecodec indicator (null, unique, complete, full, default, mixed)

class AnaRelation:
class AnaRelation:
    '''This class analyses relationship between two fields

    *Attributes* :

    - **relation** : List of the two fields involved in the relationship
    - **dist** : value of the relationship
    - **distrib** : boolean True if values are distributed
    - **hashr**: integer - hash value to identify update

    *global (@property)*

    - `id_relation`
    - `index_relation`
    - `parent_child`
    - `typecoupl`

    *characteristic (@property)*

    - `dmax`
    - `dmin`
    - `diff`
    - `dran`
    - `distomin`
    - `distomax`
    - `distance`
    - `ratecpl`
    - `rateder`

    *instance methods*

    - `to_dict`
    '''

    def __init__(self, relation, dists, hashr=None):
        '''Constructor of the relationship :

         *Parameters*

        - **relation** : List of the two fields involved in the relationship
        - **dists** : dist value or list of dist value and distrib boolean
        (distrib is None if dists is not a list)
        - **hashr**: integer - hash value to identify update
        '''
        self.relation = relation
        if isinstance(dists, list):
            dist, distrib = dists[0], dists[1]
        else:
            dist, distrib = dists, None
        self.dist = dist
        self.distrib = distrib
        self.hashr = hashr

    def __repr__(self):
        '''representation of the relation (class name + list of idfield)'''
        return f'{self.__class__.__name__}({self.id_relation!s})'

    def __str__(self):
        '''json-text build with the attributes dict (relation included)'''
        attributes = self.to_dict(relation=True)
        return json.dumps(attributes)

    def __eq__(self, other):
        ''' equal if class name and values are equal'''
        if self.__class__.__name__ != other.__class__.__name__:
            return False
        return (self.relation == other.relation
                and self.dist == other.dist
                and self.hashr == other.hashr
                and self.distrib == other.distrib)

    def __hash__(self):
        '''return hash value (sum of attributes hash)'''
        parts = (self.relation[0], self.relation[1], self.dist,
                 self.hashr, self.distrib)
        return sum(hash(part) for part in parts)

    def to_dict(self, distances=False, full=False, mode='field', relation=False,
                notnone=True, misc=False):
        '''return a dict with AnaRelation attributes.

         *Parameters*

        - **distances** : boolean (default False) - if True, distances indicators are included
        - **full** : boolean (default False) - if True, all the attributes are included
        - **relation** : boolean (default False) - if True, the fields of the
        relation and the parent_child indicator are included
        - **notnone** : boolean (default True) - if True, None values are not included
        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
        - **misc** : boolean (default False) - if True, dmax, dmin, diff and
        dran indicators are included
        '''
        dic = {DIST: self.dist, TYPECOUPL: self.typecoupl, HASHR: self.hashr}
        if relation or full:
            dic[RELATION] = Util.view(self.relation, mode)
            dic[PARENTCHILD] = self.parent_child
        if distances or full:
            dic.update({DISTANCE: self.distance, DISTOMIN: self.distomin,
                        DISTOMAX: self.distomax, DISTRIBUTED: self.distrib,
                        RATECPL: self.ratecpl, RATEDER: self.rateder})
        if misc or full:
            dic.update({DMAX: self.dmax, DMIN: self.dmin,
                        DIFF: self.diff, DRAN: self.dran})
        return Util.reduce_dic(dic) if notnone else dic

    @property
    def id_relation(self):
        '''return a list with the id of the two fields involved'''
        return [fld.idfield for fld in self.relation] if self.relation else []

    @property
    def parent_child(self):
        '''returns the direction of the relationship (True if parent is first).
        The parent has the greatest lencodec or, if equal, the lowest index.'''
        first = self.relation[0]
        second = self.relation[1]
        if first.lencodec != second.lencodec:
            return first.lencodec > second.lencodec
        return first.index < second.index

    @property
    def index_relation(self):
        '''return a list with the index of the two fields involved'''
        return [fld.index for fld in self.relation] if self.relation else []

    @property
    def dmax(self):
        '''return integer dmax indicator (product of the two lencodec)'''
        first, second = self.relation[0], self.relation[1]
        return first.lencodec * second.lencodec

    @property
    def dmin(self):
        '''return integer dmin indicator (greatest of the two lencodec)'''
        first, second = self.relation[0], self.relation[1]
        return max(first.lencodec, second.lencodec)

    @property
    def diff(self):
        '''return integer diff indicator (gap between the two lencodec)'''
        first, second = self.relation[0], self.relation[1]
        return abs(first.lencodec - second.lencodec)

    @property
    def dran(self):
        '''return integer dran indicator (dmax - dmin)'''
        return self.dmax - self.dmin

    @property
    def distomin(self):
        '''return integer distomin indicator (dist - dmin)'''
        return self.dist - self.dmin

    @property
    def distomax(self):
        '''return integer distomax indicator (dmax - dist)'''
        return self.dmax - self.dist

    @property
    def distance(self):
        '''return integer distance indicator (distomin + diff)'''
        return self.distomin + self.diff

    @property
    def ratecpl(self):
        '''return float ratecpl indicator (0 if distance + distomax is null)'''
        disdis = self.distance + self.distomax
        if disdis == 0:
            return 0
        return self.distance / disdis

    @property
    def rateder(self):
        '''return float rateder indicator (0 if dran is null)'''
        if self.dran == 0:
            return 0
        return self.distomin / self.dran

    @property
    def typecoupl(self):
        '''return relationship type (coupled, derived, crossed, linked)'''
        # the first null indicator gives the type
        rules = ((self.distance, COUPLED),
                 (self.distomin, DERIVED),
                 (self.distomax, CROSSED))
        for indicator, kind in rules:
            if indicator == 0:
                return kind
        return LINKED

This class analyses relationship between two fields

Attributes :

  • relation : List of the two fields involved in the relationship
  • dist : value of the relationship
  • distrib : boolean True if values are distributed
  • hashr: integer - hash value to identify update

global (@property)

characteristic (@property)

instance methods

AnaRelation(relation, dists, hashr=None)
289    def __init__(self, relation, dists, hashr=None):
290        '''Constructor of the relationship :
291
292         *Parameters*
293
294        - **relation** : List of the two fields involved in the relationship
295        - **dists** : dist value or list of dist value and distrib boolean
296        - **distrib** : boolean True if values are distributed
297        - **hashr**: integer - hash value to identify update
298        '''
299        self.relation = relation
300        if isinstance(dists, list):
301            self.dist = dists[0]
302            self.distrib = dists[1]
303        else:
304            self.dist = dists
305            self.distrib = None
306        self.hashr = hashr

Constructor of the relationship :

Parameters

  • relation : List of the two fields involved in the relationship
  • dists : dist value or list of dist value and distrib boolean
  • distrib : boolean True if values are distributed (not a separate argument: it is the second item of dists when dists is a list, else None)
  • hashr: integer - hash value to identify update
def to_dict( self, distances=False, full=False, mode='field', relation=False, notnone=True, misc=False):
327    def to_dict(self, distances=False, full=False, mode='field', relation=False,
328                notnone=True, misc=False):
329        '''return a dict with AnaRelation attributes.
330
331         *Parameters*
332
333        - **distances** : boolean (default False) - if True, distances indicators are included
334        - **full** : boolean (default False) - if True, all the attributes are included
335        - **relation** : boolean (default False) - if True, idfield are included
336        - **notnone** : boolean (default True) - if True, None values are not included
337        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
338        '''
339        dic = {DIST: self.dist, TYPECOUPL: self.typecoupl, HASHR: self.hashr}
340        if relation or full:
341            dic[RELATION] = Util.view(self.relation, mode)
342            #dic[TYPECOUPL] = self.typecoupl
343            dic[PARENTCHILD] = self.parent_child
344        if distances or full:
345            dic |= {DISTANCE: self.distance, DISTOMIN: self.distomin,
346                    DISTOMAX: self.distomax, DISTRIBUTED: self.distrib,
347                    RATECPL: self.ratecpl, RATEDER: self.rateder}
348        if misc or full:
349            dic |= {DMAX: self.dmax, DMIN: self.dmin,
350                    DIFF: self.diff, DRAN: self.dran}
351        if notnone:
352            return Util.reduce_dic(dic)
353        return dic

return a dict with AnaRelation attributes.

Parameters

  • distances : boolean (default False) - if True, distances indicators are included
  • full : boolean (default False) - if True, all the attributes are included
  • relation : boolean (default False) - if True, idfield are included
  • notnone : boolean (default True) - if True, None values are not included
  • misc : boolean (default False) - if True, dmax, dmin, diff and dran indicators are included
  • mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
id_relation

return a list with the id of the two fields involved

parent_child

returns the direction of the relationship (True if parent is first)

index_relation

return a list with the index of the two fields involved

dmax

return integer dmax indicator

dmin

return integer dmin indicator

diff

return integer diff indicator

dran

return integer dran indicator

distomin

return integer distomin indicator

distomax

return integer distomax indicator

distance

return integer distance indicator

ratecpl

return float ratecpl indicator

rateder

return float rateder indicator

typecoupl

return relationship type (coupled, derived, crossed, linked)

class AnaDfield(AnaField):
class AnaDfield(AnaField):
    '''This class analyses structure and relationships of fields inside a dataset

    *Attributes* :

    - **dataset** : AnaDataset object where AnaDfield is included
    - **AnaField attributes** : inheritance of AnaField object

    *relationship (@property)*

    - `list_relations`
    - `list_p_derived`
    - `list_c_derived`
    - `list_coupled`

    *field (@property)*

    - `fields`
    - `p_derived`
    - `p_distance`
    - `p_distomin`

    *global (@property)*

    - `index`
    - `dist_root`
    - `category`

    *global (instance methods)*

    - `ascendants`
    - `to_dict`
    - `view`

    *other instance methods*

    - `dic_inner_node`
    '''
    def __new__(cls, other, dataset=None):
        '''initialization of attributes from "other".

        An AnaDfield or AnaField is copied (an AnaField copy is re-classed
        as AnaDfield); any other value gives a bare new object.'''
        if isinstance(other, AnaDfield):
            return AnaDfield.__copy__(other)
        if isinstance(other, AnaField):
            new = AnaField.__copy__(other)
            new.__class__ = AnaDfield
            return new
        return object.__new__(cls)

    def __init__(self, other, dataset):
        '''AnaDfield is created by adding a AnaDataset link to an AnaField object.

         *Parameters*

        - **other** : AnaField or AnaDfield to initialize attributes (already
        consumed by `__new__`, not used here)
        - **dataset** : AnaDataset which includes the AnaDfield
        '''
        self.dataset = dataset

    def __copy__(self):
        ''' Copy all the data '''
        return self.__class__(AnaField(self), self.dataset)

    def __lt__(self, other):
        ''' return a comparison between field index'''
        return self.index < other.index

    @property
    def index(self):
        '''return the row of the field in the AnaDataset (-1 for the root field)'''
        if self == self.dataset.root:
            return -1
        return self.dataset.fields.index(self)

    @property
    def fields(self):
        '''return the list of the fields included in the AnaDataset'''
        return self.dataset.fields

    @property
    def list_relations(self):
        '''return the list of the relations with the AnaDfield'''
        return list(self.dataset.relations[self].values())

    @property
    def list_p_derived(self):
        '''return the list of the derived relations with the parents of AnaDfield'''
        return [rel for rel in self.list_relations
                if rel.typecoupl == DERIVED and not rel.parent_child]

    @property
    def list_c_derived(self):
        '''return the list of the derived relations with the children of
        AnaDfield (children whose category is unique are excluded)'''
        return [rel for rel in self.list_relations
                if rel.typecoupl == DERIVED
                and rel.parent_child
                and rel.relation[1].category != UNIQUE]

    @property
    def list_coupled(self):
        '''return the list of the coupled relations with the AnaDfield'''
        return [rel for rel in self.list_relations if rel.typecoupl == COUPLED]

    @property
    def dist_root(self):
        '''return the distance to the root field (dataset length minus lencodec)'''
        return len(self.dataset) - self.lencodec

    @property
    def category(self):
        '''return AnaDfield category (unique, rooted, coupled, derived, mixed)'''
        if self.typecodec == UNIQUE:
            return UNIQUE
        if self.typecodec in (COMPLETE, FULL):
            return ROOTED
        # coupled only counts when the other field is the parent
        if any(rel.typecoupl == COUPLED for rel in self.list_relations
               if not rel.parent_child):
            return COUPLED
        if not self.list_c_derived:
            return DERIVED
        return MIXED

    @property
    def p_derived(self):
        '''return the first derived or coupled parent of the AnaDfield'''
        category = self.category
        if category in (UNIQUE, ROOTED):
            return self.dataset.root
        if category == COUPLED:
            # first coupled field that is not itself categorised as coupled
            return [rel.relation[1] for rel in self.list_coupled
                    if rel.relation[1].category != COUPLED][0]
        parents = self.list_p_derived
        if not parents:
            return self.dataset.root
        distance_min = min(rel.distance for rel in parents)
        for rel in parents:
            if rel.distance == distance_min:
                parent_category = rel.relation[1].category
                if parent_category == ROOTED:
                    return self.dataset.root
                if parent_category == MIXED:
                    return rel.relation[1]
        return self.dataset.root

    @property
    def p_distance(self):
        '''return the first parent with minimal distance of the AnaDfield'''
        return self._p_min_dist()

    @property
    def p_distomin(self):
        '''return the first parent with minimal distomin of the AnaDfield'''
        return self._p_min_dist(False)

    def _p_min_dist(self, distance=True):
        '''return the parent with minimal distance of the AnaDfield.

         *Parameters*

        - **distance** : boolean (default True) - if True the `distance`
        indicator is used, else the `distomin` indicator
        '''
        if self.category == UNIQUE:
            return self.dataset.root
        indicator = 'distance' if distance else 'distomin'
        relations = self.list_relations
        dist_up = [getattr(rel, indicator) for rel in relations
                   if not rel.parent_child]
        if not dist_up:
            return self.dataset.root
        dist_min = min(dist_up)
        if dist_min == self.dist_root:
            return self.dataset.root
        # NOTE: candidates are taken from every relation reaching dist_min,
        # not only from those where parent_child is False
        list_dmin = [rel.relation[1] for rel in relations
                     if getattr(rel, indicator) == dist_min]
        max_lencodec = max(fld.lencodec for fld in list_dmin)
        return next(fld for fld in list_dmin if fld.lencodec == max_lencodec)

    def to_dict(self, mode='id'):
        '''return a dict with field attributes.

         *Parameters*

        - **mode** : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
        '''
        dic = super().to_dict(full=True, notnone=False)
        dic[DISTROOT] = self.dist_root
        dic[NUM] = self.index
        dic[CATEGORY] = self.category
        dic[PDISTANCE] = self.p_distance.view(mode)
        dic[PDISTOMIN] = self.p_distomin.view(mode)
        dic[PDERIVED] = self.p_derived.view(mode)
        return dic

    def view(self, mode='field'):
        ''' return a representation of the AnaDfield

         *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
        '''
        return Util.view(self, mode)

    def ascendants(self, typeparent='derived', mode='field'):
        ''' return the list of the AnaDfield's ascendants in the family tree up to
        the root AnaDfield.

         *Parameters*

        - **typeparent** : str (default 'derived') - 'derived', 'distance' or
        'distomin' (any other value is handled as 'distomin')
        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')

        *Returns* : list of parents from closest to the most distant. Parents
        are represented with index, idfield, or object
        '''
        parent = self
        listparent = []
        while parent != self.dataset.root:
            if typeparent == 'derived':
                parent = parent.p_derived
            elif typeparent == 'distance':
                parent = parent.p_distance
            else:
                parent = parent.p_distomin
            if parent != self.dataset.root:
                listparent.append(parent)
        return Util.view(listparent, mode)

    def dic_inner_node(self, mode, lname):
        '''return a child AnaDfield tree.

         *Parameters*

        - **lname** : integer - maximal length of the names
        - **mode** : string - kind of tree :
            'derived' : derived tree
            'distance': min distance tree
            'distomin': min distomin tree

        *Returns* : dict with a single key (the field index as a string
        padded with '*') whose value is a list: the node label followed by
        one sub-tree per child.

        *Raises* : ValueError if mode is not one of the three kinds of tree.
        '''
        if mode not in ('derived', 'distance', 'distomin'):
            # previously an unknown mode failed later with an unbound
            # local variable; fail early with an explicit message instead
            raise ValueError("mode should be 'derived', 'distance' or "
                             "'distomin', not " + repr(mode))
        if mode == 'distance':
            parent, indicator = self.p_distance, 'distance'
        elif mode == 'distomin':
            parent, indicator = self.p_distomin, 'distomin'
        else:
            parent, indicator = self.p_derived, 'distance'
        rel_parent = self.dataset.get_relation(self, parent)
        adding = str(getattr(rel_parent, indicator)) + ' - ' + str(self.lencodec)
        name = str(self.idfield)[:lname] + ' (' + adding + ')'
        # '*' is a placeholder for characters the tree formatter strips
        lis = [name.replace(' ', '*').replace("'", '*')]
        childs = []
        if mode == 'derived':
            category = self.category
            if category not in (ROOTED, COUPLED, UNIQUE):
                # coupled fields are attached directly under this node
                for rel in self.list_coupled:
                    lis.append(rel.relation[1].dic_inner_node(mode, lname))
            if category not in (ROOTED, UNIQUE):
                childs = [rel.relation[1] for rel in self.list_relations
                          if rel.relation[1].p_derived == self and
                          rel.relation[1].category != COUPLED]
        elif mode == 'distomin':
            childs = [rel.relation[1] for rel in self.list_relations
                      if rel.relation[1].p_distomin == self]
        else:
            childs = [rel.relation[1] for rel in self.list_relations
                      if rel.relation[1].p_distance == self]
        for fld in childs:
            lis.append(fld.dic_inner_node(mode, lname))
        return {str(self.index).ljust(2, '*'): lis}

This class analyses structure and relationships of fields inside a dataset

Attributes :

  • dataset : AnaDataset object where AnaDfield is included
  • AnaField attributes : inheritance of AnaField object

relationship (@property)

field (@property)

global (@property)

global (instance methods)

other instance methods

AnaDfield(other, dataset)
486    def __init__(self, other, dataset):
487        '''AnaDfield is created by adding a AnaDataset link to an AnaField object.
488
489         *Parameters*
490
491        - **other** : AnaField or AnaDfield to initialize attributes
492        - **dataset** : AnaDataset which includes the AnaDfield
493        '''
494        self.dataset = dataset

AnaDfield is created by adding a AnaDataset link to an AnaField object.

Parameters

  • other : AnaField or AnaDfield to initialize attributes
  • dataset : AnaDataset which includes the AnaDfield
index

return the row of the field in the AnaDataset

fields

return the list of the fields included in the AnaDataset

list_relations

return the list of the relations with the AnaDfield

list_p_derived

return the list of the derived relations with the parents of AnaDfield

list_c_derived

return the list of the derived relations with the children of AnaDfield

list_coupled

return the list of the coupled relations with the AnaDfield

dist_root

return the distance to the root field

category

return AnaDfield category (unique, rooted, coupled, derived, mixed)

p_derived

return the first derived or coupled parent of the AnaDfield

p_distance

return the first parent with minimal distance of the AnaDfield

p_distomin

return the first parent with minimal distomin of the AnaDfield

def to_dict(self, mode='id'):
613    def to_dict(self, mode='id'):
614        '''return a dict with field attributes.
615
616         *Parameters*
617
618        - **mode** : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
619        '''
620        dic = super().to_dict(full=True, notnone=False)
621        dic[DISTROOT] = self.dist_root
622        dic[NUM] = self.index
623        dic[CATEGORY] = self.category
624        dic[PDISTANCE] = self.p_distance.view(mode)
625        dic[PDISTOMIN] = self.p_distomin.view(mode)
626        dic[PDERIVED] = self.p_derived.view(mode)
627        return dic

return a dict with field attributes.

Parameters

  • mode : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
def view(self, mode='field'):
629    def view(self, mode='field'):
630        ''' return a representation of the AnaDfield
631
632         *Parameters*
633
634        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
635        '''
636        return Util.view(self, mode)

return a representation of the AnaDfield

Parameters

  • mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
def ascendants(self, typeparent='derived', mode='field'):
638    def ascendants(self, typeparent='derived', mode='field'):
639        ''' return the list of the AnaDfield's ascendants in the family tree up to
640        the root AnaDfield.
641
642         *Parameters*
643
644        - **typeparent** : str (default 'derived') - 'derived', 'distance' or 'distomin'
645        - **mode** : str (default 'field') - AnaDfield representation
646        ('field', 'id', 'index')
647
648        *Returns* : list of parents from closest to the most distant. Parents
649        are represented with index, idfield, or object
650        '''
651        parent = self
652        listparent = []
653        while parent != self.dataset.root:
654            if typeparent == 'derived':
655                parent = parent.p_derived
656            elif typeparent == 'distance':
657                parent = parent.p_distance
658            else:
659                parent = parent.p_distomin
660            if parent != self.dataset.root:
661                listparent.append(parent)
662        return Util.view(listparent, mode)

return the list of the AnaDfield's ascendants in the family tree up to the root AnaDfield.

Parameters

  • typeparent : str (default 'derived') - 'derived', 'distance' or 'distomin'
  • mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')

Returns : list of parents from closest to the most distant. Parents are represented with index, idfield, or object

def dic_inner_node(self, mode, lname):
664    def dic_inner_node(self, mode, lname):
665        '''return a child AnaDfield tree.
666
667         *Parameters*
668
669        - **lname** : integer - maximal length of the names
670        - **mode** : string (default 'derived') - kind of tree :
671            'derived' : derived tree
672            'distance': min distance tree
673            'distomin': min distomin tree
674
675        *Returns* : dict where key is a AnaDfield and value is the list of
676        the childs.
677        '''
678        adding = ''
679        if mode == 'distance':
680            rel_parent = self.dataset.get_relation(self, self.p_distance)
681            adding = str(rel_parent.distance) + ' - '
682        elif mode == 'distomin':
683            rel_parent = self.dataset.get_relation(self, self.p_distomin)
684            adding = str(rel_parent.distomin) + ' - '
685        elif mode == 'derived':
686            rel_parent = self.dataset.get_relation(self, self.p_derived)
687            adding = str(rel_parent.distance) + ' - '
688        adding += str(self.lencodec)
689        name = str(self.idfield)[:lname] + ' (' + adding + ')'
690        lis = [name.replace(' ', '*').replace("'", '*')]
691        if mode == 'derived':
692            childs = []
693            #if not self.category in (ROOTED, COUPLED):
694            if not self.category in (ROOTED, COUPLED, UNIQUE):
695                for rel in self.list_coupled:
696                    lis.append(rel.relation[1].dic_inner_node(mode, lname))
697            if not self.category in (ROOTED, UNIQUE):
698                childs = [rel.relation[1] for rel in self.list_relations
699                          if rel.relation[1].p_derived == self and
700                          rel.relation[1].category != COUPLED]
701        if mode == 'distomin':
702            childs = [rel.relation[1] for rel in self.list_relations
703                      if rel.relation[1].p_distomin == self]
704        if mode == 'distance':
705            childs = [rel.relation[1] for rel in self.list_relations
706                      if rel.relation[1].p_distance == self]
707        for fld in childs:
708            lis.append(fld.dic_inner_node(mode, lname))
709        return {str(self.index).ljust(2, '*'): lis}

return a child AnaDfield tree.

Parameters

  • lname : integer - maximal length of the names
  • mode : string - kind of tree : 'derived' : derived tree, 'distance' : min distance tree, 'distomin' : min distomin tree

Returns : dict whose single key is the field index (as a padded string) and whose value is a list holding the node label followed by the sub-trees of the children.

class AnaDataset:
 712class AnaDataset:
 713    '''This class analyses the structure of a dataset.
 714
 715    *Attributes* :
 716
 717    - **iddataset** : string or integer - Id of the Dataset
 718    - **fields** : list of the AnaDfields included
 719    - **relations** : dict of the AnaRelations between two AnaDfields
 720    - **hashd** : string - update identifier
 721
 722    *relationship (@property)*
 723
 724    - `ana_relations`
 725    - `p_relations`
 726
 727    *field (@property)*
 728
 729    - `root`
 730    - `primary`
 731    - `secondary`
 732    - `unique`
 733    - `variable`
 734
 735    *global (@property)*
 736
 737    - `category`
 738    - `complete`
 739    - `dimension`
 740
 741    *update (instance methods)*
 742
 743    - `set_relations`
 744
 745
 746    *access (instance methods)*
 747
 748    - `get_relation`
 749    - `dfield`
 750
 751    *synthesis (instance methods)*
 752
 753    - `tree`
 754    - `to_dict`
 755    - `indicator`
 756    - `partitions`
 757    - `field_partition`
 758    '''
 759
 760    def __init__(self, fields=None, relations=None, iddataset=None,
 761                 leng=None, hashd=None):
 762        '''Creation mode :
 763        - single dict attribute where keys are attributes name,
 764        - single AnaDataset attribute to make a copy
 765        - multiple attributes
 766
 767         *Parameters (multiple attributes)*
 768
 769        - **idfield** : string or integer - Id of the Field
 770        - **lencodec** : integer (default None) - length of the codec
 771        - **mincodec** : integer (default None) - number of different values
 772        - **maxcodec** : integer (default None) - length of the field
 773        - **hashf** : string (default None) - update identifier
 774        '''
 775        if isinstance(fields, AnaDataset):
 776            self.iddataset = fields.iddataset
 777            self.fields = fields.fields
 778            self.relations = fields.relations
 779            self.hashd = fields.hashd
 780            return
 781        if isinstance(fields, dict):
 782            iddataset = fields.get(IDDATASET, None)
 783            leng = fields.get(LENGTH, None)
 784            relations = fields.get(RELATIONS, None)
 785            hashd = fields.get(HASHD)
 786            fields = fields.get(FIELDS, None)
 787        self.iddataset = iddataset
 788        self.fields = [AnaDfield(AnaField(field), self)
 789                       for field in fields] if fields else []
 790        if leng:
 791            for fld in self.fields:
 792                fld.maxcodec = leng
 793        self.relations = {field: {} for field in self.fields}
 794        if relations:
 795            for fld, dic_relation in relations.items():
 796                self.set_relations(fld, dic_relation)
 797        self.hashd = hashd
 798
 799    def __len__(self):
 800        '''length of the AnaDataset (len of the AnaDfields included)'''
 801        return max(len(fld) for fld in self.fields)
 802
 803    def __eq__(self, other):
 804        ''' equal if class and values are equal'''
 805        return self.__class__ .__name__ == other.__class__.__name__ and \
 806            self.fields == other.fields and self.relations == other.relations and \
 807            self.iddataset == other.iddataset and self.hashd == other.hashd
 808
 809    def __hash__(self):
 810        '''return hash value (sum of attributes hash)'''
 811        return hash(self.iddataset) + sum(hash(fld) for fld in self.fields) + \
 812            sum(hash(rel) for rel in self.relations) + hash(self.hashd)
 813
 814    @property
 815    def category(self):
 816        '''return a list of AnaDfield category (unique, rooted, coupled, derived, mixed)'''
 817        return [fld.category for fld in self.fields]
 818
 819    @property
 820    def ana_relations(self):
 821        '''return the list of AnaRelation included'''
 822        return [rel for fldrel in self.relations.values() for rel in fldrel.values()]
 823
 824    @property
 825    def p_relations(self):
 826        '''return the list of oriented AnaRelation (parent first, child second)'''
 827        return [rel for rel in self.ana_relations if rel.parent_child]
 828
 829    @property
 830    def root(self):
 831        '''return the root AnaDfield'''
 832        len_self = len(self)
 833        return AnaDfield(AnaField(ROOT, len_self, len_self, len_self), self)
 834
 835    @property
 836    def primary(self):
 837        '''return the first partition of the partitions'''
 838        part = self.partitions(distributed=True)
 839        return part[0] if part else []
 840
 841    @property
 842    def complete(self):
 843        '''return True if the dimension is not 0'''
 844        return self.dimension > 0
 845
 846    @property
 847    def dimension(self):
 848        '''return the highest partition lenght'''
 849        return len(self.primary)
 850
 851    @property
 852    def secondary(self):
 853        '''return the derived ou coupled fields from primary'''
 854        secondary = []
 855        for field in self.primary:
 856            self._add_child(field, secondary)
 857        return [fld for fld in secondary if not fld in self.primary]
 858
 859    @property
 860    def unique(self):
 861        '''return the unique fields'''
 862        return [fld for fld in self.fields if fld.category == UNIQUE]
 863
 864    @property
 865    def variable(self):
 866        '''return the variable fields'''
 867        return [fld for fld in self.fields
 868                if not fld in self.primary + self.secondary + self.unique]
 869
 870    def set_relations(self, field, dic_relations):
 871        '''Add relations in the AnaDataset from a dict.
 872
 873         *Parameters*
 874
 875        - **field** : AnaDfield, AnaField or str (idfield) - first relation AnaDfield
 876        - **dic_relations** : dict - key is the second relation AnaDfield and
 877        value is the dist value or teh list [dist, distrib]
 878        '''
 879        fld = self.dfield(field)
 880        for other, dist in dic_relations.items():
 881            oth = self.dfield(other)
 882            self.relations[fld][oth] = AnaRelation([fld, oth], dist)
 883            self.relations[oth][fld] = AnaRelation([oth, fld], dist)
 884
 885    def get_relation(self, fld1, fld2):
 886        '''Return AnaRelation between fld1 and fld2.
 887
 888         *Parameters*
 889
 890        - **fld1** : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield
 891        - **fld2** : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield
 892        '''
 893        fl1 = self.dfield(fld1)
 894        fl2 = self.dfield(fld2)
 895        if self.root in [fl1, fl2]:
 896            return AnaRelation([fl1, fl2], len(self))
 897        return self.relations[self.dfield(fld1)][self.dfield(fld2)]
 898
 899    def dfield(self, fld):
 900        '''return the AnaDfield matching with fld. Fld is str, int, AnaDfield or AnaField'''
 901        if fld in (-1, ROOT):
 902            return self.root
 903        if isinstance(fld, AnaDfield):
 904            return fld
 905        if isinstance(fld, int):
 906            return self.fields[fld]
 907        if isinstance(fld, str):
 908            if fld in [dfld.idfield for dfld in self.fields]:
 909                return [dfld for dfld in self.fields if dfld.idfield == fld][0]
 910            # return self.root
 911            return None
 912        return AnaDfield(fld, self)
 913
 914    def tree(self, mode='derived', width=5, lname=20, string=True):
 915        '''return a string with a tree of derived Field.
 916
 917         *Parameters*
 918
 919        - **lname** : integer (default 20) - length of the names
 920        - **width** : integer (default 5) - length of the lines
 921        - **string** : boolean (default True) - if True return str else return dict
 922        - **mode** : string (default 'derived') - kind of tree :
 923            'derived' : derived tree
 924            'distance': min distance tree
 925            'distomin': min distomin tree
 926        '''
 927        lis = ['root-' + mode + '*(' + str(len(self)) + ')']
 928        if mode == 'distance':
 929            childs = [fld for fld in self.fields if fld.p_distance == self.root]
 930        elif mode == 'distomin':
 931            childs = [fld for fld in self.fields if fld.p_distomin == self.root]
 932        elif mode == 'derived':
 933            childs = [fld for fld in self.fields if fld.p_derived == self.root]
 934        for fld in childs:
 935            lis.append(fld.dic_inner_node(mode, lname))
 936        tree = {str(-1).ljust(2, '*'): lis}
 937        if string:
 938            tre = pprint.pformat(tree, indent=0, width=width)
 939            tre = tre.replace('---', ' - ')
 940            tre = tre.replace('  ', ' ')
 941            tre = tre.replace('*', ' ')
 942            for car in ["'", "\"", "{", "[", "]", "}", ","]:
 943                tre = tre.replace(car, "")
 944            return tre
 945        return Util.clean_dic(tree, '*', ' ')
 946
 947    def to_dict(self, mode='field', keys=None, relations=False):
 948        '''return a dict with fields attributes and optionaly relations attributes.
 949
 950         *Parameters*
 951
 952        - **mode** : str (default 'field') - AnaDfield representation
 953        ('field', 'id', 'index')
 954        - **relations** : boolean (default: False) - if False return a list of fields,
 955        if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}'
 956        - **keys** : string, list or tuple - list of keys or single key to return
 957        if 'all' or None, all keys are returned
 958        if list, only keys in list are returned
 959        if string, only values associated to the string(key) are returned'''
 960        fields = Util.filter_dic([fld.to_dict(mode=mode)
 961                                 for fld in self.fields], keys)
 962        leng = len(self.fields)
 963        if not relations:
 964            return fields
 965        return {'fields': fields, 'relations':
 966                [self.get_relation(i, j).to_dict(full=True, mode=mode)
 967                 for i in range(-1, leng) for j in range(i + 1, leng)]}
 968
 969    def partitions(self, mode='field', distributed=True):
 970        '''return a list of available partitions (the first is highest).
 971
 972         *Parameters*
 973
 974        - **mode** : str (default 'field') - AnaDfield representation
 975        ('field', 'id', 'index')
 976        - **distributed** : boolean (default True) - Include only distributed fields
 977        '''
 978        partit = [[fld] for fld in self.fields if fld.category == ROOTED]
 979        crossed = [rel for rel in self.ana_relations if rel.typecoupl == CROSSED
 980                   # and rel.relation[1].index > rel.relation[0].index
 981                   and rel.parent_child
 982                   and rel.relation[0].category != COUPLED
 983                   and rel.relation[1].category != COUPLED]
 984        if distributed:
 985            crossed = [rel for rel in crossed if rel.distrib]
 986        if crossed and len(crossed) == 1 and crossed[0].dist == len(self):
 987            partit.insert(0, crossed[0].relation)
 988        elif crossed:
 989            for repeat in list(range(len(crossed))):
 990                candidates = combinations(crossed, repeat + 1)
 991                for candidat in candidates:
 992                    flds = list(set(rel.relation[i]
 993                                for rel in candidat for i in [0, 1]))
 994                    if (reduce(mul, [fld.lencodec for fld in flds]) == len(self) and
 995                        len(candidat) == sum(range(len(flds))) and
 996                            (not distributed or min(rel.distrib for rel in candidat))):
 997                        partit.insert(0, flds)
 998        partit = Util.view(partit, mode)
 999        return [list(tup) for tup in
1000                sorted(sorted(list({tuple(sorted(prt)) for prt in partit})),
1001                       key=len, reverse=True)]
1002
1003    def field_partition(self, mode='field', partition=None, distributed=True):
1004        '''return a partition dict with the list of primary, secondary, unique
1005        and variable fields.
1006
1007         *Parameters*
1008
1009        - **mode** : str (default 'field') - AnaDfield representation
1010        ('field', 'id', 'index')
1011        - **partition** : list (default None) - if None, partition is the first
1012        - **distributed** : boolean (default True) - Include only distributed fields
1013        '''
1014        if not partition:
1015            partitions = self.partitions(distributed=distributed)
1016            if not partitions:
1017                return {'primary': [], 'secondary': [], 'unique': [], 'variable': []}
1018            partition = partitions[0]
1019        else:
1020            partition = [self.dfield(fld) for fld in partition]
1021        secondary = []
1022        for field in partition:
1023            self._add_child(field, secondary)
1024        secondary = [fld for fld in secondary if not fld in partition]
1025        unique = [fld for fld in self.fields if fld.category == UNIQUE]
1026        variable = [fld for fld in self.fields
1027                    if not fld in partition + secondary + unique]
1028        return Util.view({'primary': partition, 'secondary': secondary,
1029                          'unique': unique, 'variable': variable}, mode)
1030
1031    def indicator(self, fullsize, size):
1032        '''generate size indicators: ol (object lightness), ul (unicity level),
1033        gain (sizegain)
1034
1035        *Parameters*
1036
1037        - **fullsize** : int - size with full codec
1038        - **size** : int - size with existing codec
1039
1040        *Returns* : dict'''
1041        lenindex = len(self.fields)
1042        indexlen = sum(fld.lencodec for fld in self.fields)
1043        nval = len(self) * (lenindex + 1)
1044        sval = fullsize / nval
1045        ncod = indexlen + lenindex
1046
1047        if nval != ncod:
1048            scod = (size - ncod * sval) / (nval - ncod)
1049            olight = scod / sval
1050        else:
1051            olight = None
1052        return {'total values': nval, 'mean size': round(sval, 3),
1053                'unique values': ncod, 'mean coding size': round(scod, 3),
1054                'unicity level': round(ncod / nval, 3),
1055                'optimize level': round(size / fullsize, 3),
1056                'object lightness': round(olight, 3),
1057                'maxgain': round((nval - ncod) / nval, 3),
1058                'gain': round((fullsize - size) / fullsize, 3)}
1059
1060    def _add_child(self, field, childs):
1061        ''' add derived or coupled fields in the childs list'''
1062        for rel in field.list_c_derived + field.list_coupled:
1063            child = rel.relation[1]
1064            if not child in childs and not child.category == UNIQUE:
1065                childs.append(child)
1066                if not child.category in (COUPLED, UNIQUE):
1067                    self._add_child(child, childs)

This class analyses the structure of a dataset.

Attributes :

  • iddataset : string or integer - Id of the Dataset
  • fields : list of the AnaDfields included
  • relations : dict of the AnaRelations between two AnaDfields
  • hashd : string - update identifier

relationship (@property)

field (@property)

global (@property)

update (instance methods)

access (instance methods)

synthesis (instance methods)

AnaDataset(fields=None, relations=None, iddataset=None, leng=None, hashd=None)
760    def __init__(self, fields=None, relations=None, iddataset=None,
761                 leng=None, hashd=None):
762        '''Creation mode :
763        - single dict attribute where keys are attributes name,
764        - single AnaDataset attribute to make a copy
765        - multiple attributes
766
767         *Parameters (multiple attributes)*
768
769        - **idfield** : string or integer - Id of the Field
770        - **lencodec** : integer (default None) - length of the codec
771        - **mincodec** : integer (default None) - number of different values
772        - **maxcodec** : integer (default None) - length of the field
773        - **hashf** : string (default None) - update identifier
774        '''
775        if isinstance(fields, AnaDataset):
776            self.iddataset = fields.iddataset
777            self.fields = fields.fields
778            self.relations = fields.relations
779            self.hashd = fields.hashd
780            return
781        if isinstance(fields, dict):
782            iddataset = fields.get(IDDATASET, None)
783            leng = fields.get(LENGTH, None)
784            relations = fields.get(RELATIONS, None)
785            hashd = fields.get(HASHD)
786            fields = fields.get(FIELDS, None)
787        self.iddataset = iddataset
788        self.fields = [AnaDfield(AnaField(field), self)
789                       for field in fields] if fields else []
790        if leng:
791            for fld in self.fields:
792                fld.maxcodec = leng
793        self.relations = {field: {} for field in self.fields}
794        if relations:
795            for fld, dic_relation in relations.items():
796                self.set_relations(fld, dic_relation)
797        self.hashd = hashd

Creation mode :

  • single dict attribute where keys are attributes name,
  • single AnaDataset attribute to make a copy
  • multiple attributes

Parameters (multiple attributes)

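  • fields : list (default None) - fields to include, each of them is converted into an AnaDfield
  • relations : dict (default None) - for each key (a field), the value is the dict of relations used by set_relations
  • iddataset : string or integer (default None) - Id of the Dataset
  • leng : integer (default None) - if present, value assigned to the maxcodec of each field
  • hashd : string (default None) - update identifier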
category

return a list of AnaDfield category (unique, rooted, coupled, derived, mixed)

ana_relations

return the list of AnaRelation included

p_relations

return the list of oriented AnaRelation (parent first, child second)

root

return the root AnaDfield

primary

return the first partition of the partitions

complete

return True if the dimension is not 0

dimension

return the highest partition length

secondary

return the derived or coupled fields from primary

unique

return the unique fields

variable

return the variable fields

def set_relations(self, field, dic_relations):
870    def set_relations(self, field, dic_relations):
871        '''Add relations in the AnaDataset from a dict.
872
873         *Parameters*
874
875        - **field** : AnaDfield, AnaField or str (idfield) - first relation AnaDfield
876        - **dic_relations** : dict - key is the second relation AnaDfield and
877        value is the dist value or teh list [dist, distrib]
878        '''
879        fld = self.dfield(field)
880        for other, dist in dic_relations.items():
881            oth = self.dfield(other)
882            self.relations[fld][oth] = AnaRelation([fld, oth], dist)
883            self.relations[oth][fld] = AnaRelation([oth, fld], dist)

Add relations in the AnaDataset from a dict.

Parameters

  • field : AnaDfield, AnaField or str (idfield) - first relation AnaDfield
  • dic_relations : dict - key is the second relation AnaDfield and value is the dist value or the list [dist, distrib]
def get_relation(self, fld1, fld2):
885    def get_relation(self, fld1, fld2):
886        '''Return AnaRelation between fld1 and fld2.
887
888         *Parameters*
889
890        - **fld1** : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield
891        - **fld2** : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield
892        '''
893        fl1 = self.dfield(fld1)
894        fl2 = self.dfield(fld2)
895        if self.root in [fl1, fl2]:
896            return AnaRelation([fl1, fl2], len(self))
897        return self.relations[self.dfield(fld1)][self.dfield(fld2)]

Return AnaRelation between fld1 and fld2.

Parameters

  • fld1 : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield
  • fld2 : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield
def dfield(self, fld):
899    def dfield(self, fld):
900        '''return the AnaDfield matching with fld. Fld is str, int, AnaDfield or AnaField'''
901        if fld in (-1, ROOT):
902            return self.root
903        if isinstance(fld, AnaDfield):
904            return fld
905        if isinstance(fld, int):
906            return self.fields[fld]
907        if isinstance(fld, str):
908            if fld in [dfld.idfield for dfld in self.fields]:
909                return [dfld for dfld in self.fields if dfld.idfield == fld][0]
910            # return self.root
911            return None
912        return AnaDfield(fld, self)

return the AnaDfield matching with fld. Fld is str, int, AnaDfield or AnaField

def tree(self, mode='derived', width=5, lname=20, string=True):
914    def tree(self, mode='derived', width=5, lname=20, string=True):
915        '''return a string with a tree of derived Field.
916
917         *Parameters*
918
919        - **lname** : integer (default 20) - length of the names
920        - **width** : integer (default 5) - length of the lines
921        - **string** : boolean (default True) - if True return str else return dict
922        - **mode** : string (default 'derived') - kind of tree :
923            'derived' : derived tree
924            'distance': min distance tree
925            'distomin': min distomin tree
926        '''
927        lis = ['root-' + mode + '*(' + str(len(self)) + ')']
928        if mode == 'distance':
929            childs = [fld for fld in self.fields if fld.p_distance == self.root]
930        elif mode == 'distomin':
931            childs = [fld for fld in self.fields if fld.p_distomin == self.root]
932        elif mode == 'derived':
933            childs = [fld for fld in self.fields if fld.p_derived == self.root]
934        for fld in childs:
935            lis.append(fld.dic_inner_node(mode, lname))
936        tree = {str(-1).ljust(2, '*'): lis}
937        if string:
938            tre = pprint.pformat(tree, indent=0, width=width)
939            tre = tre.replace('---', ' - ')
940            tre = tre.replace('  ', ' ')
941            tre = tre.replace('*', ' ')
942            for car in ["'", "\"", "{", "[", "]", "}", ","]:
943                tre = tre.replace(car, "")
944            return tre
945        return Util.clean_dic(tree, '*', ' ')

return a string with a tree of derived Field.

Parameters

  • lname : integer (default 20) - length of the names
  • width : integer (default 5) - length of the lines
  • string : boolean (default True) - if True return str else return dict
  • mode : string (default 'derived') - kind of tree : 'derived' : derived tree 'distance': min distance tree 'distomin': min distomin tree
def to_dict(self, mode='field', keys=None, relations=False):
947    def to_dict(self, mode='field', keys=None, relations=False):
948        '''return a dict with fields attributes and optionaly relations attributes.
949
950         *Parameters*
951
952        - **mode** : str (default 'field') - AnaDfield representation
953        ('field', 'id', 'index')
954        - **relations** : boolean (default: False) - if False return a list of fields,
955        if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}'
956        - **keys** : string, list or tuple - list of keys or single key to return
957        if 'all' or None, all keys are returned
958        if list, only keys in list are returned
959        if string, only values associated to the string(key) are returned'''
960        fields = Util.filter_dic([fld.to_dict(mode=mode)
961                                 for fld in self.fields], keys)
962        leng = len(self.fields)
963        if not relations:
964            return fields
965        return {'fields': fields, 'relations':
966                [self.get_relation(i, j).to_dict(full=True, mode=mode)
967                 for i in range(-1, leng) for j in range(i + 1, leng)]}

return a dict with fields attributes and optionally relations attributes.

Parameters

  • mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
  • relations : boolean (default: False) - if False return a list of fields, if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}'
  • keys : string, list or tuple - list of keys or single key to return if 'all' or None, all keys are returned if list, only keys in list are returned if string, only values associated to the string(key) are returned
def partitions(self, mode='field', distributed=True):
 969    def partitions(self, mode='field', distributed=True):
 970        '''return a list of available partitions (the first is highest).
 971
 972         *Parameters*
 973
 974        - **mode** : str (default 'field') - AnaDfield representation
 975        ('field', 'id', 'index')
 976        - **distributed** : boolean (default True) - Include only distributed fields
 977        '''
 978        partit = [[fld] for fld in self.fields if fld.category == ROOTED]
 979        crossed = [rel for rel in self.ana_relations if rel.typecoupl == CROSSED
 980                   # and rel.relation[1].index > rel.relation[0].index
 981                   and rel.parent_child
 982                   and rel.relation[0].category != COUPLED
 983                   and rel.relation[1].category != COUPLED]
 984        if distributed:
 985            crossed = [rel for rel in crossed if rel.distrib]
 986        if crossed and len(crossed) == 1 and crossed[0].dist == len(self):
 987            partit.insert(0, crossed[0].relation)
 988        elif crossed:
 989            for repeat in list(range(len(crossed))):
 990                candidates = combinations(crossed, repeat + 1)
 991                for candidat in candidates:
 992                    flds = list(set(rel.relation[i]
 993                                for rel in candidat for i in [0, 1]))
 994                    if (reduce(mul, [fld.lencodec for fld in flds]) == len(self) and
 995                        len(candidat) == sum(range(len(flds))) and
 996                            (not distributed or min(rel.distrib for rel in candidat))):
 997                        partit.insert(0, flds)
 998        partit = Util.view(partit, mode)
 999        return [list(tup) for tup in
1000                sorted(sorted(list({tuple(sorted(prt)) for prt in partit})),
1001                       key=len, reverse=True)]

return a list of available partitions (the first is highest).

Parameters

  • mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
  • distributed : boolean (default True) - Include only distributed fields
def field_partition(self, mode='field', partition=None, distributed=True):
1003    def field_partition(self, mode='field', partition=None, distributed=True):
1004        '''return a partition dict with the list of primary, secondary, unique
1005        and variable fields.
1006
1007         *Parameters*
1008
1009        - **mode** : str (default 'field') - AnaDfield representation
1010        ('field', 'id', 'index')
1011        - **partition** : list (default None) - if None, partition is the first
1012        - **distributed** : boolean (default True) - Include only distributed fields
1013        '''
1014        if not partition:
1015            partitions = self.partitions(distributed=distributed)
1016            if not partitions:
1017                return {'primary': [], 'secondary': [], 'unique': [], 'variable': []}
1018            partition = partitions[0]
1019        else:
1020            partition = [self.dfield(fld) for fld in partition]
1021        secondary = []
1022        for field in partition:
1023            self._add_child(field, secondary)
1024        secondary = [fld for fld in secondary if not fld in partition]
1025        unique = [fld for fld in self.fields if fld.category == UNIQUE]
1026        variable = [fld for fld in self.fields
1027                    if not fld in partition + secondary + unique]
1028        return Util.view({'primary': partition, 'secondary': secondary,
1029                          'unique': unique, 'variable': variable}, mode)

return a partition dict with the list of primary, secondary, unique and variable fields.

Parameters

  • mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
  • partition : list (default None) - if None, partition is the first
  • distributed : boolean (default True) - Include only distributed fields
def indicator(self, fullsize, size):
1031    def indicator(self, fullsize, size):
1032        '''generate size indicators: ol (object lightness), ul (unicity level),
1033        gain (sizegain)
1034
1035        *Parameters*
1036
1037        - **fullsize** : int - size with full codec
1038        - **size** : int - size with existing codec
1039
1040        *Returns* : dict'''
1041        lenindex = len(self.fields)
1042        indexlen = sum(fld.lencodec for fld in self.fields)
1043        nval = len(self) * (lenindex + 1)
1044        sval = fullsize / nval
1045        ncod = indexlen + lenindex
1046
1047        if nval != ncod:
1048            scod = (size - ncod * sval) / (nval - ncod)
1049            olight = scod / sval
1050        else:
1051            olight = None
1052        return {'total values': nval, 'mean size': round(sval, 3),
1053                'unique values': ncod, 'mean coding size': round(scod, 3),
1054                'unicity level': round(ncod / nval, 3),
1055                'optimize level': round(size / fullsize, 3),
1056                'object lightness': round(olight, 3),
1057                'maxgain': round((nval - ncod) / nval, 3),
1058                'gain': round((fullsize - size) / fullsize, 3)}

generate size indicators: ol (object lightness), ul (unicity level), gain (sizegain)

Parameters

  • fullsize : int - size with full codec
  • size : int - size with existing codec

Returns : dict

class Util:
class Util:
    ''' common functions for analysis package'''

    @staticmethod
    def view(field_struc, mode):
        ''' return a representation of a AnaDfields structure (fields, id, index).

        The structure is returned unchanged if it is empty or if mode is None
        or 'field'. Otherwise each field is replaced by its idfield (mode 'id')
        or by its index (any other mode).

         *Parameters*

        - **mode** : str - AnaDfield representation ('field', 'id', 'index')
        - **field_struc** : list or dict - structure to represent (list of
        fields, list of lists of fields, dict of lists of fields or single field)
        '''
        if mode is None or mode == 'field' or not field_struc:
            return field_struc

        def show(fld):
            '''representation of a single field'''
            return fld.idfield if mode == 'id' else fld.index

        if isinstance(field_struc, dict):
            return {key: [show(fld) for fld in val]
                    for key, val in field_struc.items()}
        if isinstance(field_struc, list):
            # the first item tells if the structure is a list of lists
            if isinstance(field_struc[0], list):
                return [[show(fld) for fld in val] for val in field_struc]
            return [show(fld) for fld in field_struc]
        if isinstance(field_struc, AnaField):
            return show(field_struc)
        return field_struc

    @staticmethod
    def reduce_dic(obj):
        '''return a copy of obj where the dict items with a None value are
        removed (recursively, inside dicts and lists)'''
        if isinstance(obj, list):
            return [Util.reduce_dic(item) for item in obj]
        if not isinstance(obj, dict):
            return obj
        return {key: Util.reduce_dic(val) for key, val in obj.items()
                if val is not None}

    @staticmethod
    def clean_dic(obj, old, new):
        '''return a dict or list with updated strings by replacing "old" substring
        with "new" substring (keys and values, recursively)'''
        if isinstance(obj, str):
            return obj.replace(old, new)
        if isinstance(obj, list):
            return [Util.clean_dic(item, old, new) for item in obj]
        if isinstance(obj, dict):
            return {Util.clean_dic(key, old, new): Util.clean_dic(val, old, new)
                    for key, val in obj.items()}
        return obj

    @staticmethod
    def filter_dic(obj, keys):
        '''return extract of a list of dict or of a dict

         *Parameters*

        - **obj** : dict or list of dict - data to filter (another object is
        returned unchanged)
        - **keys** : string, list or tuple - list of keys or single key to return
        if 'all' or None, all keys are returned
        if list, only keys in list are returned
        if string, only values associated to the string(key) are returned'''
        if not keys or keys == 'all':
            return obj
        if isinstance(obj, list):
            return [Util.filter_dic(dic, keys) for dic in obj]
        if not isinstance(obj, dict):
            return obj
        if isinstance(keys, str):
            return obj.get(keys, None)
        if isinstance(keys, (list, tuple)):
            return {key: val for key, val in obj.items() if key in keys}
        return obj

common functions for analysis package

@staticmethod
def view(field_struc, mode):
1073    @staticmethod
1074    def view(field_struc, mode):
1075        ''' return a representation of a AnaDfields structure (fields, id, index).
1076
1077         *Parameters*
1078
1079        - **mode** : str - AnaDfield representation ('field', 'id', 'index')
1080        - **field_struc** : list or dict - structure to represent
1081        '''
1082        if mode is None or mode == 'field' or not field_struc:
1083            return field_struc
1084        if isinstance(field_struc, dict):
1085            return {key: [fld.idfield if mode == 'id' else fld.index for fld in val]
1086                    for key, val in field_struc.items()}
1087        if isinstance(field_struc, list) and isinstance(field_struc[0], list):
1088            return [[fld.idfield if mode == 'id' else fld.index for fld in val]
1089                    for val in field_struc]
1090        if isinstance(field_struc, list):
1091            return [fld.idfield if mode == 'id' else fld.index for fld in field_struc]
1092        if isinstance(field_struc, AnaField):
1093            return field_struc.idfield if mode == 'id' else field_struc.index
1094        return field_struc

return a representation of a AnaDfields structure (fields, id, index).

Parameters

  • mode : str - AnaDfield representation ('field', 'id', 'index')
  • field_struc : list or dict - structure to represent
@staticmethod
def reduce_dic(obj):
1096    @staticmethod
1097    def reduce_dic(obj):
1098        '''return a dict without None values'''
1099        if isinstance(obj, dict):
1100            return {key: Util.reduce_dic(val) for key, val in obj.items() 
1101                    if not val is None}
1102        if isinstance(obj, list):
1103            return [Util.reduce_dic(val) for val in obj]
1104        return obj

return a dict without None values

@staticmethod
def clean_dic(obj, old, new):
1106    @staticmethod
1107    def clean_dic(obj, old, new):
1108        '''return a dict or list with updated strings by replacing "old" substring
1109        with "new" substring'''
1110        if isinstance(obj, dict):
1111            return {Util.clean_dic(key, old, new): Util.clean_dic(val, old, new)
1112                    for key, val in obj.items()}
1113        if isinstance(obj, str):
1114            return obj.replace(old, new)
1115        if isinstance(obj, list):
1116            return [Util.clean_dic(val, old, new) for val in obj]
1117        return obj

return a dict or list with updated strings by replacing "old" substring with "new" substring

@staticmethod
def filter_dic(obj, keys):
1119    @staticmethod
1120    def filter_dic(obj, keys):
1121        '''return extract of a list of dict or of a dict
1122
1123         *Parameters*
1124
1125        - **keys** : string, list or tuple - list of keys or single key to return
1126        if 'all' or None, all keys are returned
1127        if list, only keys in list are returned
1128        if string, only values associated to the string(key) are returned'''
1129        if not keys or keys == 'all':
1130            return obj
1131        if isinstance(obj, list):
1132            return [Util.filter_dic(dic, keys) for dic in obj]
1133        if isinstance(keys, str) and isinstance(obj, dict):
1134            return obj.get(keys, None)
1135        if isinstance(keys, (list, tuple)) and isinstance(obj, dict):
1136            return {key: val for key, val in obj.items() if key in keys}
1137        return obj

return extract of a list of dict or of a dict

Parameters

  • keys : string, list or tuple - list of keys or single key to return if 'all' or None, all keys are returned if list, only keys in list are returned if string, only values associated to the string(key) are returned
class AnaError(builtins.Exception):
class AnaError(Exception):
    ''' Analysis Exception (exception class of the analysis module, it adds
    nothing to the built-in Exception)'''

Analysis Exception

Inherited Members
builtins.Exception
Exception
builtins.BaseException
with_traceback