tab-analysis.tab_analysis.analysis
This module analyses the structure and relationships included in a tabular object (Pandas DataFrame, Dataset, list of lists):
- Structure of a single field (class AnaField),
- Relationship between two fields (class AnaRelation),
- Structure and relationships of fields inside a dataset (class AnaDfield),
- Structure of a dataset (class AnaDataset).
# -*- coding: utf-8 -*-
"""
This module analyses structure and relationships included in a tabular object
(Pandas DataFrame, Dataset, list of list) :
- Structure of a single field (class `AnaField`),
- Relationship between two fields (class `AnaRelation`)
- Structure and relationships of fields inside a dataset (class `AnaDfield`)
- Structure of a dataset (class `AnaDataset`)

It contains two other classes `Util`, `AnaError`.
"""
import json
import pprint
from itertools import combinations
from operator import mul
from functools import reduce

NULL = 'null'
UNIQUE = 'unique'
COMPLETE = 'complete'
FULL = 'full'
DEFAULT = 'default'
MIXED = 'mixed'

COUPLED = 'coupled'
DERIVED = 'derived'
LINKED = 'linked'
CROSSED = 'crossed'
DISTRIBUTED = 'distributed'
ROOTED = 'rooted'
ROOT = 'root'

IDFIELD = 'id'
MINCODEC = 'mincodec'
MAXCODEC = 'maxcodec'
LENCODEC = 'lencodec'
RATECODEC = 'ratecodec'
DMINCODEC = 'dmincodec'
DMAXCODEC = 'dmaxcodec'
RANCODEC = 'rancodec'
TYPECODEC = 'typecodec'
HASHF = 'hashf'
RELATION = 'relation'
HASHR = 'hashr'
DIST = 'dist'
DMAX = 'dmax'
DMIN = 'dmin'
DIFF = 'diff'
DRAN = 'dran'
NUM = 'num'
CATEGORY = 'category'
PDERIVED = 'pderived'
PDISTANCE = 'pdistance'
PDISTOMIN = 'pdistomin'
DISDISTANCE = 'disdistance'
DERDISTANCE = 'derdistance'
DISRATECPL = 'disratecpl'
DERRATECPL = 'derratecpl'
DISRATEDER = 'disrateder'
DERRATEDER = 'derrateder'

TYPECOUPL = 'typecoupl'
PARENTCHILD = 'parentchild'
DISTANCE = 'distance'
DISTOMIN = 'distomin'
DISTOMAX = 'distomax'
DISTROOT = 'distroot'
RATECPL = 'ratecpl'
RATEDER = 'rateder'

IDDATASET = 'name'
RELATIONS = 'relations'
FIELDS = 'fields'
LENGTH = 'length'
HASHD = 'hashd'


class AnaField:
    '''This class analyses field entities.

    *Attributes*

    - **idfield** : string or integer - name or Id of the field
    - **lencodec**: integer - codec length
    - **mincodec**: integer - minimal codec length
    - **maxcodec**: integer - maximal codec length
    - **hashf**: hash value to identify modifications

    *characteristic (@property)*

    - `iscomplete`
    - `ratecodec`
    - `dmincodec`
    - `dmaxcodec`
    - `rancodec`
    - `typecodec`

    *instance methods*

    - `to_dict`
    '''

    def __init__(self, idfield, lencodec=None, mincodec=None, maxcodec=None,
                 hashf=None):
        '''Creation mode :
        - single dict attribute where keys are attributes name,
        - single AnaField (or AnaDfield) attribute to make a copy
        - multiple attributes

        *Parameters (multiple attributes)*

        - **idfield** : string or integer - Id of the Field
        - **lencodec** : integer - length of the codec (mandatory, non-zero)
        - **mincodec** : integer (default None) - minimal codec length
        - **maxcodec** : integer (default None) - maximal codec length
        - **hashf** : (default None) - update identifier

        *Raises* : AnaError if `lencodec` is missing, zero or not an integer
        (multiple attributes mode only).

        *example*

        >>> AnaField({'id': 'a', 'lencodec': 4, 'mincodec': 3,
        ...           'maxcodec': 4}).to_dict(notnone=False)
        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
        >>> AnaField('a', 4, 3, 4).to_dict(notnone=False)
        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
        '''
        if isinstance(idfield, dict):
            self.idfield = idfield.get(IDFIELD, None)
            self.lencodec = idfield.get(LENCODEC, None)
            self.mincodec = idfield.get(MINCODEC, None)
            self.maxcodec = idfield.get(MAXCODEC, None)
            self.hashf = idfield.get(HASHF, None)
            return
        if isinstance(idfield, (AnaField, AnaDfield)):
            self.idfield = idfield.idfield
            self.lencodec = idfield.lencodec
            self.mincodec = idfield.mincodec
            self.maxcodec = idfield.maxcodec
            self.hashf = idfield.hashf
            return
        if not lencodec or not isinstance(lencodec, int):
            raise AnaError("lencodec is not correct")
        self.idfield = idfield
        self.lencodec = lencodec
        self.mincodec = mincodec
        self.maxcodec = maxcodec
        self.hashf = hashf

    def __len__(self):
        '''length of the field (maxcodec if defined and non-zero, else lencodec)'''
        return self.maxcodec if self.maxcodec else self.lencodec

    def __repr__(self):
        '''representation of the field (class name + idfield)'''
        # str() keeps integer (or missing) identifiers from raising TypeError
        return self.__class__.__name__ + '(' + str(self.idfield) + ')'

    def __eq__(self, other):
        ''' equal if class name and attributes are equal'''
        return self.__class__.__name__ == other.__class__.__name__ and \
            self.idfield == other.idfield and self.lencodec == other.lencodec and \
            self.mincodec == other.mincodec and self.maxcodec == other.maxcodec and \
            self.hashf == other.hashf

    def __lt__(self, other):
        ''' return a comparison between hash value'''
        return hash(self) < hash(other)

    def __hash__(self):
        '''return hash value (sum of attributes hash)'''
        return hash(self.idfield) + hash(self.lencodec) + hash(self.mincodec) \
            + hash(self.maxcodec) + hash(self.hashf)

    def __str__(self):
        '''json-text build with the attributes dict'''
        return json.dumps(self.to_dict(idfield=True))

    def __copy__(self):
        ''' Copy all the attributes '''
        return self.__class__(self)

    def to_dict(self, full=False, idfield=False, notnone=True):
        '''return a dict with field attributes.

        *Parameters*

        - **full** : boolean (default False) - if True, idfield and the computed
        indicators are included
        - **idfield** : boolean (default False) - if True, idfield is included
        - **notnone** : boolean (default True) - if True, the dict is passed
        through `Util.reduce_dic` before being returned
        '''
        dic = {LENCODEC: self.lencodec, MINCODEC: self.mincodec,
               MAXCODEC: self.maxcodec}
        if idfield or full:
            dic[IDFIELD] = self.idfield
        if full:
            dic |= {RATECODEC: self.ratecodec, DMINCODEC: self.dmincodec,
                    DMAXCODEC: self.dmaxcodec, RANCODEC: self.rancodec,
                    TYPECODEC: self.typecodec}
        if notnone:
            return Util.reduce_dic(dic)
        return dic

    @property
    def iscomplete(self):
        '''return boolean indicator : True if mincodec and maxcodec are present'''
        return self.maxcodec is not None and self.mincodec is not None

    @property
    def ratecodec(self):
        '''return float ratecodec indicator (None if it cannot be computed)'''
        # the second operand guards the division when maxcodec == mincodec
        if self.iscomplete and self.maxcodec - self.mincodec:
            return (self.maxcodec - self.lencodec) / (self.maxcodec - self.mincodec)
        return None

    @property
    def dmincodec(self):
        '''return integer dmincodec indicator (lencodec - mincodec)'''
        return self.lencodec - self.mincodec if self.iscomplete else None

    @property
    def dmaxcodec(self):
        '''return integer dmaxcodec indicator (maxcodec - lencodec)'''
        return self.maxcodec - self.lencodec if self.iscomplete else None

    @property
    def rancodec(self):
        '''return integer rancodec indicator (maxcodec - mincodec)'''
        return self.maxcodec - self.mincodec if self.iscomplete else None

    @property
    def typecodec(self):
        '''return string typecodec indicator
        (null, unique, complete, full, default, mixed), None if not complete.
        '''
        if not self.iscomplete:
            return None
        # the order of the tests matters: the first matching rule wins
        if self.maxcodec == 0:
            return NULL
        if self.lencodec == 1:
            return UNIQUE
        if self.mincodec == self.maxcodec:
            return COMPLETE
        if self.lencodec == self.maxcodec:
            return FULL
        if self.lencodec == self.mincodec:
            return DEFAULT
        return MIXED


class AnaRelation:
    '''This class analyses relationship between two fields

    *Attributes* :

    - **relation** : List of the two fields involved in the relationship
    - **dist** : value of the relationship
    - **distrib** : boolean True if values are distributed
    - **hashr**: hash value to identify update

    *global (@property)*

    - `id_relation`
    - `index_relation`
    - `parent_child`
    - `typecoupl`

    *characteristic (@property)*

    - `dmax`
    - `dmin`
    - `diff`
    - `dran`
    - `distomin`
    - `distomax`
    - `distance`
    - `ratecpl`
    - `rateder`

    *instance methods*

    - `to_dict`
    '''

    def __init__(self, relation, dists, hashr=None):
        '''Constructor of the relationship :

        *Parameters*

        - **relation** : List of the two fields involved in the relationship
        - **dists** : dist value, or sequence (list or tuple) holding the dist
        value and the distrib boolean
        - **hashr**: (default None) - hash value to identify update
        '''
        self.relation = relation
        if isinstance(dists, (list, tuple)):
            self.dist = dists[0]
            self.distrib = dists[1]
        else:
            self.dist = dists
            self.distrib = None
        self.hashr = hashr

    def __repr__(self):
        '''representation of the relation (class name + list of idfield)'''
        return self.__class__.__name__ + '(' + str(self.id_relation) + ')'

    def __str__(self):
        '''json-text build with the attributes dict'''
        return json.dumps(self.to_dict(relation=True))

    def __eq__(self, other):
        ''' equal if class name and values are equal'''
        return self.__class__.__name__ == other.__class__.__name__ and \
            self.relation == other.relation and self.dist == other.dist and \
            self.hashr == other.hashr and self.distrib == other.distrib

    def __hash__(self):
        '''return hash value (sum of attributes hash)'''
        return hash(self.relation[0]) + hash(self.relation[1]) + \
            hash(self.dist) + hash(self.hashr) + hash(self.distrib)

    def to_dict(self, distances=False, full=False, mode='field', relation=False,
                notnone=True, misc=False):
        '''return a dict with AnaRelation attributes.

        *Parameters*

        - **distances** : boolean (default False) - if True, distances indicators
        are included
        - **full** : boolean (default False) - if True, all the attributes are included
        - **relation** : boolean (default False) - if True, the two fields and
        the parent/child direction are included
        - **notnone** : boolean (default True) - if True, the dict is passed
        through `Util.reduce_dic` before being returned
        - **misc** : boolean (default False) - if True, dmax, dmin, diff and dran
        are included
        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')
        '''
        dic = {DIST: self.dist, TYPECOUPL: self.typecoupl, HASHR: self.hashr}
        if relation or full:
            dic[RELATION] = Util.view(self.relation, mode)
            dic[PARENTCHILD] = self.parent_child
        if distances or full:
            dic |= {DISTANCE: self.distance, DISTOMIN: self.distomin,
                    DISTOMAX: self.distomax, DISTRIBUTED: self.distrib,
                    RATECPL: self.ratecpl, RATEDER: self.rateder}
        if misc or full:
            dic |= {DMAX: self.dmax, DMIN: self.dmin,
                    DIFF: self.diff, DRAN: self.dran}
        if notnone:
            return Util.reduce_dic(dic)
        return dic

    @property
    def id_relation(self):
        '''return a list with the id of the two fields involved'''
        if self.relation:
            return [fld.idfield for fld in self.relation]
        return []

    @property
    def parent_child(self):
        '''returns the direction of the relationship (True if parent is first).

        The parent is the field with the longest codec; when both codecs have
        the same length, the field with the lowest index is the parent.'''
        rel0 = self.relation[0]
        rel1 = self.relation[1]
        return (rel0.lencodec > rel1.lencodec or
                (rel0.lencodec == rel1.lencodec and rel0.index < rel1.index))

    @property
    def index_relation(self):
        '''return a list with the index of the two fields involved'''
        if self.relation:
            return [fld.index for fld in self.relation]
        return []

    @property
    def dmax(self):
        '''return integer dmax indicator (product of the two lencodec)'''
        return self.relation[0].lencodec * self.relation[1].lencodec

    @property
    def dmin(self):
        '''return integer dmin indicator (highest of the two lencodec)'''
        return max(self.relation[0].lencodec, self.relation[1].lencodec)

    @property
    def diff(self):
        '''return integer diff indicator (absolute difference of the two lencodec)'''
        return abs(self.relation[0].lencodec - self.relation[1].lencodec)

    @property
    def dran(self):
        '''return integer dran indicator (dmax - dmin)'''
        return self.dmax - self.dmin

    @property
    def distomin(self):
        '''return integer distomin indicator (dist - dmin)'''
        return self.dist - self.dmin

    @property
    def distomax(self):
        '''return integer distomax indicator (dmax - dist)'''
        return self.dmax - self.dist

    @property
    def distance(self):
        '''return integer distance indicator (distomin + diff)'''
        return self.distomin + self.diff

    @property
    def ratecpl(self):
        '''return float ratecpl indicator (0 when distance + distomax is 0)'''
        disdis = self.distance + self.distomax
        return 0 if disdis == 0 else self.distance / disdis

    @property
    def rateder(self):
        '''return float rateder indicator (0 when dran is 0)'''
        return 0 if self.dran == 0 else self.distomin / self.dran

    @property
    def typecoupl(self):
        '''return relationship type (coupled, derived, crossed, linked)'''
        if self.distance == 0:
            return COUPLED
        if self.distomin == 0:
            return DERIVED
        if self.distomax == 0:
            return CROSSED
        return LINKED


class AnaDfield(AnaField):
    '''This class analyses structure and relationships of fields inside a dataset

    *Attributes* :

    - **dataset** : AnaDataset object where AnaDfield is included
    - **AnaField attributes** : inheritance of AnaField object

    *relationship (@property)*

    - `list_relations`
    - `list_p_derived`
    - `list_c_derived`
    - `list_coupled`

    *field (@property)*

    - `fields`
    - `p_derived`
    - `p_distance`
    - `p_distomin`

    *global (@property)*

    - `index`
    - `dist_root`
    - `category`

    *global (instance methods)*

    - `ascendants`
    - `to_dict`
    - `view`

    *other instance methods*

    - `dic_inner_node`
    '''
    def __new__(cls, other, dataset=None):
        '''initialization of attributes from "other".

        The AnaField attributes are copied here because `__init__` only stores
        the dataset link.'''
        if isinstance(other, AnaDfield):
            return AnaDfield.__copy__(other)
        if isinstance(other, AnaField):
            new = AnaField.__copy__(other)
            # promote the copied AnaField so that __init__ is run on it
            new.__class__ = AnaDfield
            return new
        return object.__new__(cls)

    def __init__(self, other, dataset):
        '''AnaDfield is created by adding a AnaDataset link to an AnaField object.

        *Parameters*

        - **other** : AnaField or AnaDfield to initialize attributes
        (consumed by `__new__`)
        - **dataset** : AnaDataset which includes the AnaDfield
        '''
        self.dataset = dataset

    def __copy__(self):
        ''' Copy all the data '''
        return self.__class__(AnaField(self), self.dataset)

    def __lt__(self, other):
        ''' return a comparison between field index'''
        return self.index < other.index

    @property
    def index(self):
        '''return the row of the field in the AnaDataset (-1 for the root)'''
        if self == self.dataset.root:
            return -1
        return self.dataset.fields.index(self)

    @property
    def fields(self):
        '''return the list of the fields included in the AnaDataset'''
        return self.dataset.fields

    @property
    def list_relations(self):
        '''return the list of the relations with the AnaDfield'''
        return list(self.dataset.relations[self].values())

    @property
    def list_p_derived(self):
        '''return the list of the derived relations with the parents of AnaDfield'''
        return [rel for rel in self.list_relations
                if rel.typecoupl == DERIVED and not rel.parent_child]

    @property
    def list_c_derived(self):
        '''return the list of the derived relations with the childs of AnaDfield
        (unique childs are excluded)'''
        return [rel for rel in self.list_relations
                if rel.typecoupl == DERIVED
                and rel.parent_child
                and rel.relation[1].category != UNIQUE]

    @property
    def list_coupled(self):
        '''return the list of the coupled relations with the AnaDfield'''
        return [rel for rel in self.list_relations if rel.typecoupl == COUPLED]

    @property
    def dist_root(self):
        '''return the distance to the root field'''
        return len(self.dataset) - self.lencodec

    @property
    def category(self):
        '''return AnaDfield category (unique, rooted, coupled, derived, mixed)'''
        if self.typecodec == UNIQUE:
            return UNIQUE
        if self.typecodec in (COMPLETE, FULL):
            return ROOTED
        if COUPLED in [rel.typecoupl for rel in self.list_relations
                       if not rel.parent_child]:
            return COUPLED
        if not self.list_c_derived:
            return DERIVED
        return MIXED

    @property
    def p_derived(self):
        '''return the first derived or coupled parent of the AnaDfield'''
        if self.category in (UNIQUE, ROOTED):
            return self.dataset.root
        if self.category == COUPLED:
            return [rel.relation[1] for rel in self.list_coupled
                    if rel.relation[1].category != COUPLED][0]
        if not self.list_p_derived:
            return self.dataset.root
        distance_min = min(rel.distance for rel in self.list_p_derived)
        for rel in self.list_p_derived:
            if rel.distance == distance_min:
                if rel.relation[1].category == ROOTED:
                    return self.dataset.root
                if rel.relation[1].category == MIXED:
                    return rel.relation[1]
        return self.dataset.root

    @property
    def p_distance(self):
        '''return the first parent with minimal distance of the AnaDfield'''
        return self._p_min_dist()

    @property
    def p_distomin(self):
        '''return the first parent with minimal distomin of the AnaDfield'''
        return self._p_min_dist(False)

    def _p_min_dist(self, distance=True):
        '''return the parent with minimal distance of the AnaDfield.

        *Parameters*

        - **distance** : boolean (default True) - if True the `distance`
        indicator is used, else the `distomin` indicator
        '''
        if self.category == UNIQUE:
            return self.dataset.root
        indicator = 'distance' if distance else 'distomin'
        relations = self.list_relations
        dist_up = [getattr(rel, indicator) for rel in relations
                   if not rel.parent_child]
        if not dist_up or min(dist_up) == self.dist_root:
            return self.dataset.root
        dist_min = min(dist_up)
        # every relation reaching the minimum is a candidate (not only parents)
        list_dmin = [rel.relation[1] for rel in relations
                     if getattr(rel, indicator) == dist_min]
        max_lencodec = max(fld.lencodec for fld in list_dmin)
        return [fld for fld in list_dmin if fld.lencodec == max_lencodec][0]

    def to_dict(self, mode='id'):
        '''return a dict with field attributes.

        *Parameters*

        - **mode** : str (default 'id') - AnaDfield representation
        ('field', 'id', 'index')
        '''
        dic = super().to_dict(full=True, notnone=False)
        dic[DISTROOT] = self.dist_root
        dic[NUM] = self.index
        dic[CATEGORY] = self.category
        dic[PDISTANCE] = self.p_distance.view(mode)
        dic[PDISTOMIN] = self.p_distomin.view(mode)
        dic[PDERIVED] = self.p_derived.view(mode)
        return dic

    def view(self, mode='field'):
        ''' return a representation of the AnaDfield

        *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')
        '''
        return Util.view(self, mode)

    def ascendants(self, typeparent='derived', mode='field'):
        ''' return the list of the AnaDfield's ascendants in the family tree up to
        the root AnaDfield.

        *Parameters*

        - **typeparent** : str (default 'derived') - 'derived', 'distance' or
        'distomin' (any other value is handled as 'distomin')
        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')

        *Returns* : list of parents from closest to the most distant (the root
        is not included).
        '''
        root = self.dataset.root
        parent = self
        listparent = []
        while parent != root:
            if typeparent == 'derived':
                parent = parent.p_derived
            elif typeparent == 'distance':
                parent = parent.p_distance
            else:
                parent = parent.p_distomin
            if parent != root:
                listparent.append(parent)
        return Util.view(listparent, mode)

    def dic_inner_node(self, mode, lname):
        '''return a child AnaDfield tree.

        *Parameters*

        - **lname** : integer - maximal length of the names
        - **mode** : string - kind of tree :
            'derived' : derived tree
            'distance': min distance tree
            'distomin': min distomin tree

        *Returns* : dict with a single item : the key is the index of the
        AnaDfield and the value is a list with the name of the node followed by
        the trees of the childs.
        '''
        adding = ''
        if mode == 'distance':
            rel_parent = self.dataset.get_relation(self, self.p_distance)
            adding = str(rel_parent.distance) + ' - '
        elif mode == 'distomin':
            rel_parent = self.dataset.get_relation(self, self.p_distomin)
            adding = str(rel_parent.distomin) + ' - '
        elif mode == 'derived':
            rel_parent = self.dataset.get_relation(self, self.p_derived)
            adding = str(rel_parent.distance) + ' - '
        adding += str(self.lencodec)
        name = self.idfield[:lname] + ' (' + adding + ')'
        # '*' is a placeholder for spaces and quotes (removed by the tree rendering)
        lis = [name.replace(' ', '*').replace("'", '*')]
        childs = []
        if mode == 'derived':
            if self.category not in (ROOTED, COUPLED):
                for rel in self.list_coupled:
                    lis.append(rel.relation[1].dic_inner_node(mode, lname))
            if self.category not in (ROOTED, UNIQUE):
                childs = [rel.relation[1] for rel in self.list_relations
                          if rel.relation[1].p_derived == self and
                          rel.relation[1].category != COUPLED]
        if mode == 'distomin':
            childs = [rel.relation[1] for rel in self.list_relations
                      if rel.relation[1].p_distomin == self]
        if mode == 'distance':
            childs = [rel.relation[1] for rel in self.list_relations
                      if rel.relation[1].p_distance == self]
        for fld in childs:
            lis.append(fld.dic_inner_node(mode, lname))
        return {str(self.index).ljust(2, '*'): lis}


class AnaDataset:
    '''This class analyses the structure of a dataset.

    *Attributes* :

    - **iddataset** : string or integer - Id of the Dataset
    - **fields** : list of the AnaDfields included
    - **relations** : dict of the AnaRelations between two AnaDfields
    - **hashd** : string - update identifier

    *relationship (@property)*

    - `ana_relations`
    - `p_relations`

    *field (@property)*

    - `root`
    - `primary`
    - `secondary`
    - `unique`
    - `variable`

    *global (@property)*

    - `category`
    - `complete`
    - `dimension`

    *update (instance methods)*

    - `set_relations`

    *access (instance methods)*

    - `get_relation`
    - `dfield`

    *synthesis (instance methods)*

    - `tree`
    - `to_dict`
    - `indicator`
    - `partitions`
    - `field_partition`
    '''

    def __init__(self, fields=None, relations=None, iddataset=None,
                 leng=None, hashd=None):
        '''Creation mode :
        - single dict attribute where keys are attributes name
        ('name', 'fields', 'relations', 'length', 'hashd'),
        - single AnaDataset attribute to make a copy (fields and relations
        are shared, not duplicated),
        - multiple attributes

        *Parameters (multiple attributes)*

        - **fields** : list (default None) - items accepted by the AnaField
        constructor (dict, AnaField, AnaDfield)
        - **relations** : dict (default None) - key is a field and value is a
        dict as expected by `set_relations`
        - **iddataset** : string or integer (default None) - Id of the Dataset
        - **leng** : integer (default None) - if present, maxcodec value applied
        to each field
        - **hashd** : string (default None) - update identifier
        '''
        if isinstance(fields, AnaDataset):
            self.iddataset = fields.iddataset
            self.fields = fields.fields
            self.relations = fields.relations
            self.hashd = fields.hashd
            return
        if isinstance(fields, dict):
            iddataset = fields.get(IDDATASET, None)
            leng = fields.get(LENGTH, None)
            relations = fields.get(RELATIONS, None)
            hashd = fields.get(HASHD)
            fields = fields.get(FIELDS, None)
        self.iddataset = iddataset
        self.fields = [AnaDfield(AnaField(field), self)
                       for field in fields] if fields else []
        if leng:
            # maxcodec is part of the hash: it is set before the fields are
            # used as keys of the relations dict
            for fld in self.fields:
                fld.maxcodec = leng
        self.relations = {field: {} for field in self.fields}
        if relations:
            for fld, dic_relation in relations.items():
                self.set_relations(fld, dic_relation)
        self.hashd = hashd

    def __len__(self):
        '''length of the AnaDataset (highest length of the AnaDfields included,
        0 if the AnaDataset has no field)'''
        return max((len(fld) for fld in self.fields), default=0)

    def __eq__(self, other):
        ''' equal if class name and values are equal'''
        return self.__class__.__name__ == other.__class__.__name__ and \
            self.fields == other.fields and self.relations == other.relations and \
            self.iddataset == other.iddataset and self.hashd == other.hashd

    def __hash__(self):
        '''return hash value (sum of attributes hash)'''
        # iterating self.relations yields its keys (the AnaDfields)
        return hash(self.iddataset) + sum(hash(fld) for fld in self.fields) + \
            sum(hash(rel) for rel in self.relations) + hash(self.hashd)

    @property
    def category(self):
        '''return a list of AnaDfield category (unique, rooted, coupled, derived, mixed)'''
        return [fld.category for fld in self.fields]

    @property
    def ana_relations(self):
        '''return the list of AnaRelation included'''
        return [rel for fldrel in self.relations.values() for rel in fldrel.values()]

    @property
    def p_relations(self):
        '''return the list of oriented AnaRelation (parent first, child second)'''
        return [rel for rel in self.ana_relations if rel.parent_child]

    @property
    def root(self):
        '''return the root AnaDfield (a new object is built at each call)'''
        len_self = len(self)
        return AnaDfield(AnaField(ROOT, len_self, len_self, len_self), self)

    @property
    def primary(self):
        '''return the first partition of the partitions'''
        part = self.partitions(distributed=True)
        return part[0] if part else []

    @property
    def complete(self):
        '''return True if the dimension is not 0'''
        return self.dimension > 0

    @property
    def dimension(self):
        '''return the highest partition length'''
        return len(self.primary)

    @property
    def secondary(self):
        '''return the derived or coupled fields from primary'''
        primary = self.primary  # computed once: partitions are expensive
        secondary = []
        for field in primary:
            self._add_child(field, secondary)
        return [fld for fld in secondary if fld not in primary]

    @property
    def unique(self):
        '''return the unique fields'''
        return [fld for fld in self.fields if fld.category == UNIQUE]

    @property
    def variable(self):
        '''return the variable fields (neither primary, secondary nor unique)'''
        others = self.primary + self.secondary + self.unique
        return [fld for fld in self.fields if fld not in others]

    def set_relations(self, field, dic_relations):
        '''Add relations in the AnaDataset from a dict.

        Each relation is stored in both directions.

        *Parameters*

        - **field** : AnaDfield, AnaField or str (idfield) - first relation AnaDfield
        - **dic_relations** : dict - key is the second relation AnaDfield and
        value is the dist value or the list [dist, distrib]
        '''
        fld = self.dfield(field)
        for other, dist in dic_relations.items():
            oth = self.dfield(other)
            self.relations[fld][oth] = AnaRelation([fld, oth], dist)
            self.relations[oth][fld] = AnaRelation([oth, fld], dist)

    def get_relation(self, fld1, fld2):
        '''Return AnaRelation between fld1 and fld2.

        A relation with the root is built on request (its dist value is the
        length of the AnaDataset).

        *Parameters*

        - **fld1** : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield
        - **fld2** : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield
        '''
        fl1 = self.dfield(fld1)
        fl2 = self.dfield(fld2)
        if self.root in [fl1, fl2]:
            return AnaRelation([fl1, fl2], len(self))
        return self.relations[fl1][fl2]

    def dfield(self, fld):
        '''return the AnaDfield matching with fld. Fld is str, int, AnaDfield
        or AnaField (-1 and 'root' return the root, an unknown str returns None)'''
        if fld in (-1, ROOT):
            return self.root
        if isinstance(fld, AnaDfield):
            return fld
        if isinstance(fld, int):
            return self.fields[fld]
        if isinstance(fld, str):
            for dfld in self.fields:
                if dfld.idfield == fld:
                    return dfld
            return None
        return AnaDfield(fld, self)

    def tree(self, mode='derived', width=5, lname=20, string=True):
        '''return a string with a tree of derived Field.

        *Parameters*

        - **lname** : integer (default 20) - length of the names
        - **width** : integer (default 5) - length of the lines
        - **string** : boolean (default True) - if True return str else return dict
        - **mode** : string (default 'derived') - kind of tree :
            'derived' : derived tree
            'distance': min distance tree
            'distomin': min distomin tree

        *Raises* : AnaError if mode is not one of the three kinds of tree.
        '''
        lis = ['root-' + mode + '*(' + str(len(self)) + ')']
        if mode == 'distance':
            childs = [fld for fld in self.fields if fld.p_distance == self.root]
        elif mode == 'distomin':
            childs = [fld for fld in self.fields if fld.p_distomin == self.root]
        elif mode == 'derived':
            childs = [fld for fld in self.fields if fld.p_derived == self.root]
        else:
            # explicit error instead of an UnboundLocalError on 'childs'
            raise AnaError("mode is not correct")
        for fld in childs:
            lis.append(fld.dic_inner_node(mode, lname))
        tree = {str(-1).ljust(2, '*'): lis}
        if string:
            tre = pprint.pformat(tree, indent=0, width=width)
            tre = tre.replace('---', ' - ')
            # NOTE(review): the listing this code was restored from had its
            # whitespace collapsed; the pattern is assumed to be two spaces
            # (a one-space pattern would be a no-op) - confirm.
            tre = tre.replace('  ', ' ')
            tre = tre.replace('*', ' ')
            for car in ["'", "\"", "{", "[", "]", "}", ","]:
                tre = tre.replace(car, "")
            return tre
        return Util.clean_dic(tree, '*', ' ')

    def to_dict(self, mode='field', keys=None, relations=False):
        '''return a dict with fields attributes and optionaly relations attributes.

        *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')
        - **relations** : boolean (default: False) - if False return a list of fields,
        if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}'
        - **keys** : string, list or tuple - list of keys or single key to return
        if 'all' or None, all keys are returned
        if list, only keys in list are returned
        if string, only values associated to the string(key) are returned'''
        fields = Util.filter_dic([fld.to_dict(mode=mode)
                                  for fld in self.fields], keys)
        if not relations:
            return fields
        leng = len(self.fields)
        # index -1 is the root: each pair (root included) is listed once
        return {'fields': fields, 'relations':
                [self.get_relation(i, j).to_dict(full=True, mode=mode)
                 for i in range(-1, leng) for j in range(i + 1, leng)]}

    def partitions(self, mode='field', distributed=True):
        '''return a list of available partitions (the first is highest).

        *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')
        - **distributed** : boolean (default True) - Include only distributed fields
        '''
        len_self = len(self)
        partit = [[fld] for fld in self.fields if fld.category == ROOTED]
        crossed = [rel for rel in self.ana_relations if rel.typecoupl == CROSSED
                   and rel.parent_child
                   and rel.relation[0].category != COUPLED
                   and rel.relation[1].category != COUPLED]
        if distributed:
            crossed = [rel for rel in crossed if rel.distrib]
        if len(crossed) == 1 and crossed[0].dist == len_self:
            partit.insert(0, crossed[0].relation)
        elif crossed:
            for size in range(1, len(crossed) + 1):
                for candidat in combinations(crossed, size):
                    flds = list(set(rel.relation[i]
                                    for rel in candidat for i in [0, 1]))
                    # a partition needs : product of codec lengths equal to the
                    # dataset length and a crossed relation for each pair of fields
                    if (reduce(mul, [fld.lencodec for fld in flds]) == len_self and
                        len(candidat) == sum(range(len(flds))) and
                            (not distributed or all(rel.distrib for rel in candidat))):
                        partit.insert(0, flds)
        partit = Util.view(partit, mode)
        return [list(tup) for tup in
                sorted(sorted({tuple(sorted(prt)) for prt in partit}),
                       key=len, reverse=True)]

    def field_partition(self, mode='field', partition=None, distributed=True):
        '''return a partition dict with the list of primary, secondary, unique
        and variable fields.

        *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')
        - **partition** : list (default None) - if None, partition is the first
        - **distributed** : boolean (default True) - Include only distributed fields
        '''
        if not partition:
            partitions = self.partitions(distributed=distributed)
            if not partitions:
                return {'primary': [], 'secondary': [], 'unique': [], 'variable': []}
            partition = partitions[0]
        else:
            partition = [self.dfield(fld) for fld in partition]
        secondary = []
        for field in partition:
            self._add_child(field, secondary)
        secondary = [fld for fld in secondary if fld not in partition]
        unique = [fld for fld in self.fields if fld.category == UNIQUE]
        others = partition + secondary + unique
        variable = [fld for fld in self.fields if fld not in others]
        return Util.view({'primary': partition, 'secondary': secondary,
                          'unique': unique, 'variable': variable}, mode)

    def indicator(self, fullsize, size):
        '''generate size indicators: ol (object lightness), ul (unicity level),
        gain (sizegain)

        *Parameters*

        - **fullsize** : int - size with full codec
        - **size** : int - size with existing codec

        *Returns* : dict. 'mean coding size' and 'object lightness' are None
        when the number of values equals the number of unique values.'''
        lenindex = len(self.fields)
        indexlen = sum(fld.lencodec for fld in self.fields)
        nval = len(self) * (lenindex + 1)
        sval = fullsize / nval
        ncod = indexlen + lenindex

        if nval != ncod:
            scod = (size - ncod * sval) / (nval - ncod)
            olight = scod / sval
        else:
            # no value is shared: the coding indicators are not defined
            scod = None
            olight = None
        return {'total values': nval, 'mean size': round(sval, 3),
                'unique values': ncod,
                'mean coding size': None if scod is None else round(scod, 3),
                'unicity level': round(ncod / nval, 3),
                'optimize level': round(size / fullsize, 3),
                'object lightness': None if olight is None else round(olight, 3),
                'maxgain': round((nval - ncod) / nval, 3),
                'gain': round((fullsize - size) / fullsize, 3)}

    def _add_child(self, field, childs):
        ''' add derived or coupled fields in the childs list (in place,
        recursively through the derived childs)'''
        for rel in field.list_c_derived + field.list_coupled:
            child = rel.relation[1]
            if child not in childs and child.category != UNIQUE:
                childs.append(child)
                if child.category not in (COUPLED, UNIQUE):
                    self._add_child(child, childs)


# The listing continues with the `Util` helper class. Only its opening lines
# are visible in this part of the file, so they are kept verbatim below and
# are not rewritten:
#
# class Util:
#     ''' common functions for analysis package'''
#
#     @staticmethod
#     def view(field_struc, mode):
#         ''' return a representation of a AnaDfields structure (fields, id, index).
1074 1075 *Parameters* 1076 1077 - **mode** : str - AnaDfield representation ('field', 'id', 'index') 1078 - **field_struc** : list or dict - structure to represent 1079 ''' 1080 if mode is None or mode == 'field' or not field_struc: 1081 return field_struc 1082 if isinstance(field_struc, dict): 1083 return {key: [fld.idfield if mode == 'id' else fld.index for fld in val] 1084 for key, val in field_struc.items()} 1085 if isinstance(field_struc, list) and isinstance(field_struc[0], list): 1086 return [[fld.idfield if mode == 'id' else fld.index for fld in val] 1087 for val in field_struc] 1088 if isinstance(field_struc, list): 1089 return [fld.idfield if mode == 'id' else fld.index for fld in field_struc] 1090 if isinstance(field_struc, AnaField): 1091 return field_struc.idfield if mode == 'id' else field_struc.index 1092 return field_struc 1093 1094 @staticmethod 1095 def reduce_dic(obj): 1096 '''return a dict without empty or None values''' 1097 return {key: val for key, val in obj.items() if not val is None} 1098 1099 @staticmethod 1100 def clean_dic(obj, old, new): 1101 '''return a dict or list with updated strings by replacing "old" substring 1102 with "new" substring''' 1103 if isinstance(obj, dict): 1104 return {Util.clean_dic(key, old, new): Util.clean_dic(val, old, new) 1105 for key, val in obj.items()} 1106 if isinstance(obj, str): 1107 return obj.replace(old, new) 1108 if isinstance(obj, list): 1109 return [Util.clean_dic(val, old, new) for val in obj] 1110 return obj 1111 1112 @staticmethod 1113 def filter_dic(obj, keys): 1114 '''return extract of a list of dict or of a dict 1115 1116 *Parameters* 1117 1118 - **keys** : string, list or tuple - list of keys or single key to return 1119 if 'all' or None, all keys are returned 1120 if list, only keys in list are returned 1121 if string, only values associated to the string(key) are returned''' 1122 if not keys or keys == 'all': 1123 return obj 1124 if isinstance(obj, list): 1125 return [Util.filter_dic(dic, 
keys) for dic in obj] 1126 if isinstance(keys, str) and isinstance(obj, dict): 1127 return obj.get(keys, None) 1128 if isinstance(keys, (list, tuple)) and isinstance(obj, dict): 1129 return {key: val for key, val in obj.items() if key in keys} 1130 return obj 1131 1132 1133class AnaError(Exception): 1134 ''' Analysis Exception''' 1135 # pass
class AnaField:
    '''This class analyses field entities.

    *Attributes*

    - **idfield** : string or integer - name or Id of the field
    - **lencodec**: integer - codec length
    - **mincodec**: integer - minimal codec length
    - **maxcodec**: integer - maximal codec length
    - **hashf**: hash value to identify modifications

    *characteristic (@property)*

    - `iscomplete`
    - `ratecodec`
    - `dmincodec`
    - `dmaxcodec`
    - `rancodec`
    - `typecodec`

    *instance methods*

    - `to_dict`

    '''

    def __init__(self, idfield, lencodec=None, mincodec=None, maxcodec=None, hashf=None):
        '''Creation mode :
        - single dict attribute where keys are attributes name,
        - single AnaField attribute to make a copy
        - multiple attributes

        *Parameters (multiple attributes)*

        - **idfield** : string or integer - Id of the Field
        - **lencodec** : integer (default None) - length of the codec
        - **mincodec** : integer (default None) - number of different values
        - **maxcodec** : integer (default None) - length of the field
        - **hashf** : string (default None) - update identifier

        *Raises* : AnaError if, in the multiple attributes mode, lencodec is
        missing, null or not an integer

        *example*

        AnaField is created with a dict
        >>> AnaField({'id': 'a', 'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}).to_dict()
        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}

        AnaField is created with parameters
        >>> AnaField('a', lencodec=4, mincodec=3, maxcodec=4).to_dict()
        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
        >>> AnaField('a', 4, 3, 4).to_dict()
        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
        '''
        if isinstance(idfield, dict):
            # dict mode : missing keys give None attributes, no validation
            self.idfield = idfield.get(IDFIELD, None)
            self.lencodec = idfield.get(LENCODEC, None)
            self.mincodec = idfield.get(MINCODEC, None)
            self.maxcodec = idfield.get(MAXCODEC, None)
            self.hashf = idfield.get(HASHF, None)
            return
        # copy mode : AnaDfield is a subclass of AnaField, so this single
        # check covers both classes
        if isinstance(idfield, AnaField):
            self.idfield = idfield.idfield
            self.lencodec = idfield.lencodec
            self.mincodec = idfield.mincodec
            self.maxcodec = idfield.maxcodec
            self.hashf = idfield.hashf
            return
        if not lencodec or not isinstance(lencodec, int):
            raise AnaError("lencodec is not correct")
        self.idfield = idfield
        self.lencodec = lencodec
        self.mincodec = mincodec
        self.maxcodec = maxcodec
        self.hashf = hashf

    def __len__(self):
        '''length of the field (maxcodec if defined and not null, else lencodec)'''
        return self.maxcodec if self.maxcodec else self.lencodec

    def __repr__(self):
        '''representation of the field (class name + idfield)'''
        # idfield may be an integer or None : it is formatted, not concatenated
        return f'{self.__class__.__name__}({self.idfield})'

    def __eq__(self, other):
        ''' equal if class and attributes are equal'''
        return self.__class__ .__name__ == other.__class__.__name__ and \
            self.idfield == other.idfield and self.lencodec == other.lencodec and \
            self.mincodec == other.mincodec and self.maxcodec == other.maxcodec and \
            self.hashf == other.hashf

    def __lt__(self, other):
        ''' return a comparison between hash value'''
        return hash(self) < hash(other)

    def __hash__(self):
        '''return hash value (sum of attributes hash)'''
        return hash(self.idfield) + hash(self.lencodec) + hash(self.mincodec) \
            + hash(self.maxcodec) + hash(self.hashf)

    def __str__(self):
        '''json-text build with the attributes dict'''
        return json.dumps(self.to_dict(idfield=True))

    def __copy__(self):
        ''' Copy all the attributes '''
        return self.__class__(self)

    def to_dict(self, full=False, idfield=False, notnone=True):
        '''return a dict with field attributes.

         *Parameters*

        - **full** : boolean (default False) - if True, all the attributes are included
        - **idfield** : boolean (default False) - if True, idfield is included
        - **notnone** : boolean (default True) - if True, None values are not included
        '''
        dic = {LENCODEC: self.lencodec, MINCODEC: self.mincodec,
               MAXCODEC: self.maxcodec}
        if idfield or full:
            dic[IDFIELD] = self.idfield
        if full:
            dic |= {RATECODEC: self.ratecodec, DMINCODEC: self.dmincodec,
                    DMAXCODEC: self.dmaxcodec, RANCODEC: self.rancodec,
                    TYPECODEC: self.typecodec}
        if notnone:
            return Util.reduce_dic(dic)
        return dic

    @property
    def iscomplete(self):
        '''return boolean indicator : True if mincodec and maxcodec are present'''
        return self.maxcodec is not None and self.mincodec is not None

    @property
    def ratecodec(self):
        '''return float ratecodec indicator (None if not complete or if
        maxcodec equals mincodec)'''
        if self.iscomplete and self.maxcodec - self.mincodec:
            return (self.maxcodec - self.lencodec) / (self.maxcodec - self.mincodec)
        return None

    @property
    def dmincodec(self):
        '''return integer dmincodec indicator (lencodec - mincodec)'''
        return self.lencodec - self.mincodec if self.iscomplete else None

    @property
    def dmaxcodec(self):
        '''return integer dmaxcodec indicator (maxcodec - lencodec)'''
        return self.maxcodec - self.lencodec if self.iscomplete else None

    @property
    def rancodec(self):
        '''return integer rancodec indicator (maxcodec - mincodec)'''
        return self.maxcodec - self.mincodec if self.iscomplete else None

    @property
    def typecodec(self):
        '''return string typecodec indicator
        (null, unique, complete, full, default, mixed)
        '''
        if self.maxcodec is None or self.mincodec is None:
            return None
        # the order of the tests matters : the first matching type is returned
        if self.maxcodec == 0:
            return NULL
        if self.lencodec == 1:
            return UNIQUE
        if self.mincodec == self.maxcodec:
            return COMPLETE
        if self.lencodec == self.maxcodec:
            return FULL
        if self.lencodec == self.mincodec:
            return DEFAULT
        return MIXED
This class analyses field entities.
Attributes
- idfield : string - name or Id of the field
- lencodec: integer - codec length
- mincodec: integer - minimal codec length
- maxcodec: integer - maximal codec length
- hashf: integer - hash value to identify modifications
characteristic (@property)
instance methods
def __init__(self, idfield, lencodec=None, mincodec=None, maxcodec=None, hashf=None):
    '''Build an AnaField from one of three kinds of input :
    - a dict whose keys are the attribute names (missing keys give None),
    - an AnaField (or AnaDfield) to copy,
    - the individual attributes.

    *Parameters (individual attributes)*

    - **idfield** : string or integer - Id of the Field
    - **lencodec** : integer (default None) - length of the codec
    - **mincodec** : integer (default None) - number of different values
    - **maxcodec** : integer (default None) - length of the field
    - **hashf** : string (default None) - update identifier

    *Raises* : AnaError if, with individual attributes, lencodec is missing,
    null or not an integer

    *example*

    >>> AnaField({'id': 'a', 'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}).to_dict()
    {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
    >>> AnaField('a', 4, 3, 4).to_dict()
    {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
    '''
    source = idfield
    if isinstance(source, dict):
        # dict mode : the other parameters are ignored
        idfield, lencodec, mincodec, maxcodec, hashf = (
            source.get(IDFIELD, None), source.get(LENCODEC, None),
            source.get(MINCODEC, None), source.get(MAXCODEC, None),
            source.get(HASHF, None))
    elif isinstance(source, (AnaField, AnaDfield)):
        # copy mode : the other parameters are ignored
        idfield, lencodec, mincodec, maxcodec, hashf = (
            source.idfield, source.lencodec, source.mincodec,
            source.maxcodec, source.hashf)
    elif not (lencodec and isinstance(lencodec, int)):
        raise AnaError("lencodec is not correct")
    self.idfield = idfield
    self.lencodec = lencodec
    self.mincodec = mincodec
    self.maxcodec = maxcodec
    self.hashf = hashf
Creation mode :
- single dict attribute where keys are attributes name,
- single AnaField attribute to make a copy
- multiple attributes
Parameters (multiple attributes)
- idfield : string or integer - Id of the Field
- lencodec : integer (default None) - length of the codec
- mincodec : integer (default None) - number of different values
- maxcodec : integer (default None) - length of the field
- hashf : string (default None) - update identifier
example
AnaField is created with a dict
>>> AnaField(Cfield([1,2,3,3]).to_analysis).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
>>> AnaField({'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
AnaField is created with parameters (idfield is the first, mandatory, parameter)
>>> AnaField('a', lencodec=4, mincodec=3, maxcodec=4).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
>>> AnaField('a', 4, 3, 4).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
def to_dict(self, full=False, idfield=False, notnone=True):
    '''Export the field attributes as a dict.

     *Parameters*

    - **full** : boolean (default False) - if True, idfield and the
    characteristic indicators are added to the codec lengths
    - **idfield** : boolean (default False) - if True, idfield is included
    - **notnone** : boolean (default True) - if True, None values are removed
    '''
    result = {LENCODEC: self.lencodec,
              MINCODEC: self.mincodec,
              MAXCODEC: self.maxcodec}
    if full or idfield:
        result[IDFIELD] = self.idfield
    if full:
        result.update({RATECODEC: self.ratecodec,
                       DMINCODEC: self.dmincodec,
                       DMAXCODEC: self.dmaxcodec,
                       RANCODEC: self.rancodec,
                       TYPECODEC: self.typecodec})
    return Util.reduce_dic(result) if notnone else result
return a dict with field attributes.
Parameters
- full : boolean (default False) - if True, all the attributes are included
- idfield : boolean (default False) - if True, idfield is included
- notnone : boolean (default True) - if True, None values are not included
class AnaRelation:
    '''Analysis of the relationship between two fields.

    *Attributes* :

    - **relation** : List of the two fields involved in the relationship
    - **dist** : value of the relationship
    - **distrib** : boolean True if values are distributed
    - **hashr**: integer - hash value to identify update

    *global (@property)*

    - `id_relation`
    - `index_relation`
    - `parent_child`
    - `typecoupl`

    *characteristic (@property)*

    - `dmax`
    - `dmin`
    - `diff`
    - `dran`
    - `distomin`
    - `distomax`
    - `distance`
    - `ratecpl`
    - `rateder`

    *instance methods*

    - `to_dict`
    '''

    def __init__(self, relation, dists, hashr=None):
        '''Constructor of the relationship :

         *Parameters*

        - **relation** : List of the two fields involved in the relationship
        - **dists** : dist value, or list [dist value, distrib boolean]
        - **hashr**: integer - hash value to identify update
        '''
        with_distrib = isinstance(dists, list)
        self.relation = relation
        self.dist = dists[0] if with_distrib else dists
        self.distrib = dists[1] if with_distrib else None
        self.hashr = hashr

    def __repr__(self):
        '''representation of the relation (class name + list of idfield)'''
        return f'{self.__class__.__name__}({self.id_relation})'

    def __str__(self):
        '''json-text build with the attributes dict'''
        return json.dumps(self.to_dict(relation=True))

    def __eq__(self, other):
        ''' equal if class and values are equal'''
        if self.__class__.__name__ != other.__class__.__name__:
            return False
        return (self.relation == other.relation and self.dist == other.dist
                and self.hashr == other.hashr
                and self.distrib == other.distrib)

    def __hash__(self):
        '''return hash value (sum of attributes hash)'''
        first, second = self.relation[0], self.relation[1]
        return sum((hash(first), hash(second), hash(self.dist),
                    hash(self.hashr), hash(self.distrib)))

    def to_dict(self, distances=False, full=False, mode='field', relation=False,
                notnone=True, misc=False):
        '''return a dict with AnaRelation attributes.

         *Parameters*

        - **distances** : boolean (default False) - if True, distances indicators are included
        - **full** : boolean (default False) - if True, all the attributes are included
        - **relation** : boolean (default False) - if True, the relation and
        the parent/child indicator are included
        - **notnone** : boolean (default True) - if True, None values are not included
        - **misc** : boolean (default False) - if True, dmax, dmin, diff and
        dran are included
        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
        '''
        result = {DIST: self.dist, TYPECOUPL: self.typecoupl, HASHR: self.hashr}
        if full or relation:
            result[RELATION] = Util.view(self.relation, mode)
            result[PARENTCHILD] = self.parent_child
        if full or distances:
            result.update({DISTANCE: self.distance, DISTOMIN: self.distomin,
                           DISTOMAX: self.distomax, DISTRIBUTED: self.distrib,
                           RATECPL: self.ratecpl, RATEDER: self.rateder})
        if full or misc:
            result.update({DMAX: self.dmax, DMIN: self.dmin,
                           DIFF: self.diff, DRAN: self.dran})
        return Util.reduce_dic(result) if notnone else result

    @property
    def id_relation(self):
        '''return a list with the id of the two fields involved'''
        return [fld.idfield for fld in self.relation] if self.relation else []

    @property
    def parent_child(self):
        '''returns the direction of the relationship (True if parent is first).

        The first field is the parent if its codec is the longest or, with
        equal codec lengths, if its index is the lowest.'''
        first, second = self.relation[0], self.relation[1]
        return (first.lencodec > second.lencodec or
                (first.lencodec == second.lencodec and first.index < second.index))

    @property
    def index_relation(self):
        '''return a list with the index of the two fields involved'''
        return [fld.index for fld in self.relation] if self.relation else []

    @property
    def dmax(self):
        '''return integer dmax indicator (product of the two codec lengths)'''
        first, second = self.relation[0], self.relation[1]
        return first.lencodec * second.lencodec

    @property
    def dmin(self):
        '''return integer dmin indicator (highest of the two codec lengths)'''
        first, second = self.relation[0], self.relation[1]
        return max(first.lencodec, second.lencodec)

    @property
    def diff(self):
        '''return integer diff indicator (gap between the two codec lengths)'''
        first, second = self.relation[0], self.relation[1]
        return abs(first.lencodec - second.lencodec)

    @property
    def dran(self):
        '''return integer dran indicator (dmax - dmin)'''
        return self.dmax - self.dmin

    @property
    def distomin(self):
        '''return integer distomin indicator (dist - dmin)'''
        return self.dist - self.dmin

    @property
    def distomax(self):
        '''return integer distomax indicator (dmax - dist)'''
        return self.dmax - self.dist

    @property
    def distance(self):
        '''return integer distance indicator (distomin + diff)'''
        return self.distomin + self.diff

    @property
    def ratecpl(self):
        '''return float ratecpl indicator (0 if distance + distomax is null)'''
        total = self.distance + self.distomax
        if total == 0:
            return 0
        return self.distance / total

    @property
    def rateder(self):
        '''return float rateder indicator (0 if dran is null)'''
        if self.dran == 0:
            return 0
        return self.distomin / self.dran

    @property
    def typecoupl(self):
        '''return relationship type (coupled, derived, crossed, linked)'''
        # the first null indicator found gives the type
        if self.distance == 0:
            return COUPLED
        if self.distomin == 0:
            return DERIVED
        if self.distomax == 0:
            return CROSSED
        return LINKED
This class analyses relationship between two fields
Attributes :
- relation : List of the two fields involved in the relationship
- dist : value of the relationship
- distrib : boolean True if values are distributed
- hashr: integer - hash value to identify update
global (@property)
characteristic (@property)
instance methods
289 def __init__(self, relation, dists, hashr=None): 290 '''Constructor of the relationship : 291 292 *Parameters* 293 294 - **relation** : List of the two fields involved in the relationship 295 - **dists** : dist value or list of dist value and distrib boolean 296 - **distrib** : boolean True if values are distributed 297 - **hashr**: integer - hash value to identify update 298 ''' 299 self.relation = relation 300 if isinstance(dists, list): 301 self.dist = dists[0] 302 self.distrib = dists[1] 303 else: 304 self.dist = dists 305 self.distrib = None 306 self.hashr = hashr
Constructor of the relationship :
Parameters
- relation : List of the two fields involved in the relationship
- dists : dist value or list of dist value and distrib boolean
- distrib : boolean True if values are distributed
- hashr: integer - hash value to identify update
def to_dict(self, distances=False, full=False, mode='field', relation=False,
            notnone=True, misc=False):
    '''Export the AnaRelation attributes as a dict.

     *Parameters*

    - **distances** : boolean (default False) - if True, distances indicators are included
    - **full** : boolean (default False) - if True, all the attributes are included
    - **relation** : boolean (default False) - if True, the relation and
    the parent/child indicator are included
    - **notnone** : boolean (default True) - if True, None values are not included
    - **misc** : boolean (default False) - if True, dmax, dmin, diff and
    dran are included
    - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
    '''
    result = {DIST: self.dist, TYPECOUPL: self.typecoupl, HASHR: self.hashr}
    if full or relation:
        result[RELATION] = Util.view(self.relation, mode)
        result[PARENTCHILD] = self.parent_child
    if full or distances:
        result.update({DISTANCE: self.distance, DISTOMIN: self.distomin,
                       DISTOMAX: self.distomax, DISTRIBUTED: self.distrib,
                       RATECPL: self.ratecpl, RATEDER: self.rateder})
    if full or misc:
        result.update({DMAX: self.dmax, DMIN: self.dmin,
                       DIFF: self.diff, DRAN: self.dran})
    return Util.reduce_dic(result) if notnone else result
return a dict with AnaRelation attributes.
Parameters
- distances : boolean (default False) - if True, distances indicators are included
- full : boolean (default False) - if True, all the attributes are included
- relation : boolean (default False) - if True, the relation (the two fields) and the parent/child indicator are included
- notnone : boolean (default True) - if True, None values are not included
- mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
class AnaDfield(AnaField):
    '''Analysis of the structure and relationships of a field inside a dataset.

    *Attributes* :

    - **dataset** : AnaDataset object where AnaDfield is included
    - **AnaField attributes** : inheritance of AnaField object

    *relationship (@property)*

    - `list_relations`
    - `list_p_derived`
    - `list_c_derived`
    - `list_coupled`

    *field (@property)*

    - `fields`
    - `p_derived`
    - `p_distance`
    - `p_distomin`

    *global (@property)*

    - `index`
    - `dist_root`
    - `category`

    *global (instance methods)*

    - `ascendants`
    - `to_dict`
    - `view`

    *other instance methods*

    - `dic_inner_node`
    '''
    def __new__(cls, other, dataset=None):
        '''initialization of attributes from "other"'''
        if isinstance(other, AnaDfield):
            return AnaDfield.__copy__(other)
        if isinstance(other, AnaField):
            # copy the AnaField then switch the class of the copy
            clone = AnaField.__copy__(other)
            clone.__class__ = AnaDfield
            return clone
        return object.__new__(cls)

    def __init__(self, other, dataset):
        '''AnaDfield is created by adding a AnaDataset link to an AnaField object.

         *Parameters*

        - **other** : AnaField or AnaDfield to initialize attributes
        (the attributes are copied in `__new__`)
        - **dataset** : AnaDataset which includes the AnaDfield
        '''
        self.dataset = dataset

    def __copy__(self):
        ''' Copy all the data '''
        return self.__class__(AnaField(self), self.dataset)

    def __lt__(self, other):
        ''' return a comparison between field index'''
        return self.index < other.index

    @property
    def index(self):
        '''return the row of the field in the AnaDataset (-1 for the root)'''
        owner = self.dataset
        return -1 if self == owner.root else owner.fields.index(self)

    @property
    def fields(self):
        '''return the list of the fields included in the AnaDataset'''
        return self.dataset.fields

    @property
    def list_relations(self):
        '''return the list of the relations with the AnaDfield'''
        return list(self.dataset.relations[self].values())

    @property
    def list_p_derived(self):
        '''return the list of the derived relations with the parents of AnaDfield'''
        return [rel for rel in self.list_relations
                if rel.typecoupl == DERIVED and not rel.parent_child]

    @property
    def list_c_derived(self):
        '''return the list of the derived relations with the childs of AnaDfield'''
        return [rel for rel in self.list_relations
                if rel.typecoupl == DERIVED and rel.parent_child
                and rel.relation[1].category != UNIQUE]

    @property
    def list_coupled(self):
        '''return the list of the coupled relations with the AnaDfield'''
        return [rel for rel in self.list_relations if rel.typecoupl == COUPLED]

    @property
    def dist_root(self):
        '''return the distance to the root field'''
        return len(self.dataset) - self.lencodec

    @property
    def category(self):
        '''return AnaDfield category (unique, rooted, coupled, derived, mixed)'''
        codec_type = self.typecodec
        if codec_type == UNIQUE:
            return UNIQUE
        if codec_type in (COMPLETE, FULL):
            return ROOTED
        # coupled with a field which is its parent
        if any(rel.typecoupl == COUPLED for rel in self.list_relations
               if not rel.parent_child):
            return COUPLED
        return MIXED if self.list_c_derived else DERIVED

    @property
    def p_derived(self):
        '''return the first derived or coupled parent of the AnaDfield'''
        root = self.dataset.root
        kind = self.category
        if kind in (UNIQUE, ROOTED):
            return root
        if kind == COUPLED:
            masters = [rel.relation[1] for rel in self.list_coupled
                       if rel.relation[1].category != COUPLED]
            return masters[0]
        parents = self.list_p_derived
        if not parents:
            return root
        nearest = min(rel.distance for rel in parents)
        for rel in parents:
            if rel.distance != nearest:
                continue
            candidate = rel.relation[1]
            if candidate.category == ROOTED:
                return root
            if candidate.category == MIXED:
                return candidate
        return root

    @property
    def p_distance(self):
        '''return the first parent with minimal distance of the AnaDfield'''
        return self._p_min_dist()

    @property
    def p_distomin(self):
        '''return the first parent with minimal distomin of the AnaDfield'''
        return self._p_min_dist(False)

    def _p_min_dist(self, distance=True):
        '''return the parent with minimal distance of the AnaDfield

         *Parameters*

        - **distance** : boolean (default True) - if True the indicator used
        is distance, else distomin
        '''
        root = self.dataset.root
        if self.category == UNIQUE:
            return root
        metric = 'distance' if distance else 'distomin'
        upward = [getattr(rel, metric) for rel in self.list_relations
                  if not rel.parent_child]
        if not upward:
            return root
        lowest = min(upward)
        if lowest == self.dist_root:
            return root
        nearest = [rel.relation[1] for rel in self.list_relations
                   if getattr(rel, metric) == lowest]
        # max() returns the first field having the longest codec
        return max(nearest, key=lambda fld: fld.lencodec)

    def to_dict(self, mode='id'):
        '''return a dict with field attributes.

         *Parameters*

        - **mode** : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
        '''
        dic = super().to_dict(full=True, notnone=False)
        dic |= {DISTROOT: self.dist_root,
                NUM: self.index,
                CATEGORY: self.category,
                PDISTANCE: self.p_distance.view(mode),
                PDISTOMIN: self.p_distomin.view(mode),
                PDERIVED: self.p_derived.view(mode)}
        return dic

    def view(self, mode='field'):
        ''' return a representation of the AnaDfield

         *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
        '''
        return Util.view(self, mode)

    def ascendants(self, typeparent='derived', mode='field'):
        ''' return the list of the AnaDfield's ascendants in the family tree up to
        the root AnaDfield.

         *Parameters*

        - **typeparent** : str (default 'derived') - 'derived', 'distance' or 'distomin'
        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')

        *Returns* : list of parents from closest to the most distant. Parents
        are represented with index, idfield, or object
        '''
        # any other value of typeparent selects the distomin parent
        parent_attr = {'derived': 'p_derived',
                       'distance': 'p_distance'}.get(typeparent, 'p_distomin')
        root = self.dataset.root
        lineage = []
        current = self
        while current != root:
            current = getattr(current, parent_attr)
            if current != root:
                lineage.append(current)
        return Util.view(lineage, mode)

    def dic_inner_node(self, mode, lname):
        '''return a child AnaDfield tree.

         *Parameters*

        - **lname** : integer - maximal length of the names
        - **mode** : string - kind of tree :
            'derived' : derived tree
            'distance': min distance tree
            'distomin': min distomin tree

        *Returns* : dict where key is the index of the AnaDfield and value is
        the list with the name of the node followed by the trees of the childs.
        '''
        prefix = ''
        if mode in ('distance', 'derived'):
            parent = self.p_distance if mode == 'distance' else self.p_derived
            prefix = str(self.dataset.get_relation(self, parent).distance) + ' - '
        elif mode == 'distomin':
            link = self.dataset.get_relation(self, self.p_distomin)
            prefix = str(link.distomin) + ' - '
        label = self.idfield[:lname] + ' (' + prefix + str(self.lencodec) + ')'
        # '*' is used as a placeholder for spaces and quotes
        node = [label.replace(' ', '*').replace("'", '*')]
        if mode == 'derived':
            childs = []
            if self.category not in (ROOTED, COUPLED):
                node.extend(rel.relation[1].dic_inner_node(mode, lname)
                            for rel in self.list_coupled)
            if self.category not in (ROOTED, UNIQUE):
                childs = [rel.relation[1] for rel in self.list_relations
                          if rel.relation[1].p_derived == self
                          and rel.relation[1].category != COUPLED]
        if mode == 'distomin':
            childs = [rel.relation[1] for rel in self.list_relations
                      if rel.relation[1].p_distomin == self]
        if mode == 'distance':
            childs = [rel.relation[1] for rel in self.list_relations
                      if rel.relation[1].p_distance == self]
        node.extend(fld.dic_inner_node(mode, lname) for fld in childs)
        return {str(self.index).ljust(2, '*'): node}
This class analyses structure and relationships of fields inside a dataset
Attributes :
- dataset : AnaDataset object where AnaDfield is included
- AnaField attributes : inheritance of AnaField object
relationship (@property)
field (@property)
global (@property)
global (instance methods)
other instance methods
def __init__(self, other, dataset):
    '''AnaDfield is created by adding a AnaDataset link to an AnaField object.

    *Parameters*

    - **other** : AnaField or AnaDfield to initialize attributes
    (not read in this method body - presumably consumed when the object is
    created, to be confirmed)
    - **dataset** : AnaDataset which includes the AnaDfield
    '''
    # the only state set here is the link to the owning dataset
    self.dataset = dataset
AnaDfield is created by adding a AnaDataset link to an AnaField object.
Parameters
- other : AnaField or AnaDfield to initialize attributes
- dataset : AnaDataset which includes the AnaDfield
def to_dict(self, mode='id'):
    '''return a dict with the field attributes completed with the attributes
    which depend on the dataset.

    *Parameters*

    - **mode** : str (default 'id') - representation of the parent AnaDfields
    ('field', 'id', 'index')

    *Returns* : dict - result of the parent class `to_dict(full=True,
    notnone=False)` extended with the distance to the root, the index, the
    category and the three parents (distance, distomin, derived).
    '''
    result = super().to_dict(full=True, notnone=False)
    # a dict literal is evaluated in order, so the attributes are read in
    # the same sequence as with item by item assignments
    result.update({
        DISTROOT: self.dist_root,
        NUM: self.index,
        CATEGORY: self.category,
        PDISTANCE: self.p_distance.view(mode),
        PDISTOMIN: self.p_distomin.view(mode),
        PDERIVED: self.p_derived.view(mode),
    })
    return result
return a dict with field attributes.
Parameters
- mode : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
def view(self, mode='field'):
    ''' return a representation of the AnaDfield

    *Parameters*

    - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')

    *Returns* : the value returned by `Util.view` for this AnaDfield and
    this mode.
    '''
    # the conversion is fully delegated to the shared helper
    return Util.view(self, mode)
return a representation of the AnaDfield
Parameters
- mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
def ascendants(self, typeparent='derived', mode='field'):
    ''' return the chain of parents of the AnaDfield, the root AnaDfield
    being excluded.

    *Parameters*

    - **typeparent** : str (default 'derived') - parent link to follow :
    'derived', 'distance', any other value selects 'distomin'
    - **mode** : str (default 'field') - AnaDfield representation
    ('field', 'id', 'index')

    *Returns* : list of parents from the closest to the most distant, converted
    with `Util.view`.
    '''
    if typeparent == 'derived':
        parent_attr = 'p_derived'
    elif typeparent == 'distance':
        parent_attr = 'p_distance'
    else:
        parent_attr = 'p_distomin'
    chain = []
    current = self
    # climb one parent at a time until the root of the dataset is reached
    while current != self.dataset.root:
        current = getattr(current, parent_attr)
        if current != self.dataset.root:
            chain.append(current)
    return Util.view(chain, mode)
return the list of the AnaDfield's ascendants in the family tree up to the root AnaDfield.
Parameters
- typeparent : str (default 'derived') - 'derived', 'distance' or 'distomin'
- mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
Returns : list of parents from closest to the most distant. Parents are represented with index, idfield, or object
def dic_inner_node(self, mode, lname):
    '''return a child AnaDfield tree.

    *Parameters*

    - **lname** : integer - maximal length of the names
    - **mode** : string (no default value in the signature) - kind of tree :
        'derived' : derived tree
        'distance': min distance tree
        'distomin': min distomin tree

    *Returns* : dict with a single item : the key is the index of the
    AnaDfield (string padded with '*') and the value is a list whose first
    item is the label of the field, followed by one dict per child (built
    with the same method).
    '''
    # first part of the label : value read on the relation with the parent
    adding = ''
    if mode == 'distance':
        rel_parent = self.dataset.get_relation(self, self.p_distance)
        adding = str(rel_parent.distance) + ' - '
    elif mode == 'distomin':
        rel_parent = self.dataset.get_relation(self, self.p_distomin)
        adding = str(rel_parent.distomin) + ' - '
    elif mode == 'derived':
        rel_parent = self.dataset.get_relation(self, self.p_derived)
        adding = str(rel_parent.distance) + ' - '
    adding += str(self.lencodec)
    name = self.idfield[:lname] + ' (' + adding + ')'
    # spaces and quotes are replaced by '*' in the label
    lis = [name.replace(' ', '*').replace("'", '*')]
    if mode == 'derived':
        childs = []
        # coupled fields are appended directly to the node
        if not self.category in (ROOTED, COUPLED):
            for rel in self.list_coupled:
                lis.append(rel.relation[1].dic_inner_node(mode, lname))
        # other childs : fields whose derived parent is this field
        if not self.category in (ROOTED, UNIQUE):
            childs = [rel.relation[1] for rel in self.list_relations
                      if rel.relation[1].p_derived == self and
                      rel.relation[1].category != COUPLED]
    if mode == 'distomin':
        childs = [rel.relation[1] for rel in self.list_relations
                  if rel.relation[1].p_distomin == self]
    if mode == 'distance':
        childs = [rel.relation[1] for rel in self.list_relations
                  if rel.relation[1].p_distance == self]
    # NOTE(review): for any other mode `childs` is never bound and the loop
    # below raises UnboundLocalError
    for fld in childs:
        lis.append(fld.dic_inner_node(mode, lname))
    return {str(self.index).ljust(2, '*'): lis}
return a child AnaDfield tree.
Parameters
- lname : integer - maximal length of the names
- mode : string (default 'derived') - kind of tree : 'derived' : derived tree 'distance': min distance tree 'distomin': min distomin tree
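Returns : dict with a single item, where the key is the index of the AnaDfield (as a string) and the value is the list made of the field label followed by the trees of its children.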
class AnaDataset:
    '''This class analyses the structure of a dataset.

    *Attributes* :

    - **iddataset** : string or integer - Id of the Dataset
    - **fields** : list of the AnaDfields included
    - **relations** : dict of the AnaRelations between two AnaDfields
    - **hashd** : string - update identifier

    *relationship (@property)*

    - `ana_relations`
    - `p_relations`

    *field (@property)*

    - `root`
    - `primary`
    - `secondary`
    - `unique`
    - `variable`

    *global (@property)*

    - `category`
    - `complete`
    - `dimension`

    *update (instance methods)*

    - `set_relations`

    *access (instance methods)*

    - `get_relation`
    - `dfield`

    *synthesis (instance methods)*

    - `tree`
    - `to_dict`
    - `indicator`
    - `partitions`
    - `field_partition`
    '''

    def __init__(self, fields=None, relations=None, iddataset=None,
                 leng=None, hashd=None):
        '''Creation mode :
        - single dict attribute where keys are attributes name,
        - single AnaDataset attribute to make a copy
        - multiple attributes

        *Parameters (multiple attributes)*

        - **fields** : list (default None) - definition of the fields, each
        item is converted with `AnaField` then `AnaDfield`
        - **relations** : dict (default None) - key is a field and value is
        the dict of relations passed to `set_relations`
        - **iddataset** : string or integer (default None) - Id of the Dataset
        - **leng** : integer (default None) - if set, value assigned to the
        `maxcodec` of every field
        - **hashd** : string (default None) - update identifier
        '''
        if isinstance(fields, AnaDataset):
            # the attributes are shared with the source object (no duplication)
            self.iddataset = fields.iddataset
            self.fields = fields.fields
            self.relations = fields.relations
            self.hashd = fields.hashd
            return
        if isinstance(fields, dict):
            # 'fields' is read last because the name is rebound by this line
            iddataset = fields.get(IDDATASET, None)
            leng = fields.get(LENGTH, None)
            relations = fields.get(RELATIONS, None)
            hashd = fields.get(HASHD)
            fields = fields.get(FIELDS, None)
        self.iddataset = iddataset
        self.fields = [AnaDfield(AnaField(field), self)
                       for field in fields] if fields else []
        if leng:
            for fld in self.fields:
                fld.maxcodec = leng
        self.relations = {field: {} for field in self.fields}
        if relations:
            for fld, dic_relation in relations.items():
                self.set_relations(fld, dic_relation)
        self.hashd = hashd

    def __len__(self):
        '''length of the AnaDataset (len of the AnaDfields included),
        0 if there is no AnaDfield'''
        return max((len(fld) for fld in self.fields), default=0)

    def __eq__(self, other):
        ''' equal if class and values are equal'''
        return self.__class__ .__name__ == other.__class__.__name__ and \
            self.fields == other.fields and self.relations == other.relations and \
            self.iddataset == other.iddataset and self.hashd == other.hashd

    def __hash__(self):
        '''return hash value (sum of attributes hash)'''
        # NOTE(review): iterating `self.relations` yields its keys, not the
        # AnaRelation objects - confirm this is intended
        return hash(self.iddataset) + sum(hash(fld) for fld in self.fields) + \
            sum(hash(rel) for rel in self.relations) + hash(self.hashd)

    @property
    def category(self):
        '''return a list of AnaDfield category (unique, rooted, coupled, derived, mixed)'''
        return [fld.category for fld in self.fields]

    @property
    def ana_relations(self):
        '''return the list of AnaRelation included'''
        return [rel for fldrel in self.relations.values() for rel in fldrel.values()]

    @property
    def p_relations(self):
        '''return the list of oriented AnaRelation (parent first, child second)'''
        return [rel for rel in self.ana_relations if rel.parent_child]

    @property
    def root(self):
        '''return the root AnaDfield (a new object at each access)'''
        len_self = len(self)
        return AnaDfield(AnaField(ROOT, len_self, len_self, len_self), self)

    @property
    def primary(self):
        '''return the first partition of the partitions'''
        part = self.partitions(distributed=True)
        return part[0] if part else []

    @property
    def complete(self):
        '''return True if the dimension is not 0'''
        return self.dimension > 0

    @property
    def dimension(self):
        '''return the highest partition length'''
        return len(self.primary)

    @property
    def secondary(self):
        '''return the derived or coupled fields from primary'''
        # `primary` runs the partition search : it is evaluated only once
        primary = self.primary
        secondary = []
        for field in primary:
            self._add_child(field, secondary)
        return [fld for fld in secondary if fld not in primary]

    @property
    def unique(self):
        '''return the unique fields'''
        return [fld for fld in self.fields if fld.category == UNIQUE]

    @property
    def variable(self):
        '''return the variable fields (neither primary, secondary nor unique)'''
        classified = self.primary + self.secondary + self.unique
        return [fld for fld in self.fields if fld not in classified]

    def set_relations(self, field, dic_relations):
        '''Add relations in the AnaDataset from a dict.

        *Parameters*

        - **field** : AnaDfield, AnaField or str (idfield) - first relation AnaDfield
        - **dic_relations** : dict - key is the second relation AnaDfield and
        value is the dist value or the list [dist, distrib]
        '''
        fld = self.dfield(field)
        for other, dist in dic_relations.items():
            oth = self.dfield(other)
            # the relation is stored in both directions
            self.relations[fld][oth] = AnaRelation([fld, oth], dist)
            self.relations[oth][fld] = AnaRelation([oth, fld], dist)

    def get_relation(self, fld1, fld2):
        '''Return AnaRelation between fld1 and fld2.

        *Parameters*

        - **fld1** : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield
        - **fld2** : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield

        *Returns* : the stored AnaRelation, or a new AnaRelation (dist is the
        length of the dataset) if one of the fields is the root.
        '''
        fl1 = self.dfield(fld1)
        fl2 = self.dfield(fld2)
        if self.root in [fl1, fl2]:
            # relations with the root are not stored, they are built on demand
            return AnaRelation([fl1, fl2], len(self))
        return self.relations[fl1][fl2]

    def dfield(self, fld):
        '''return the AnaDfield matching with fld. Fld is str, int, AnaDfield or AnaField.
        -1 and 'root' return the root, an unknown str returns None'''
        if fld in (-1, ROOT):
            return self.root
        if isinstance(fld, AnaDfield):
            return fld
        if isinstance(fld, int):
            return self.fields[fld]
        if isinstance(fld, str):
            return next((dfld for dfld in self.fields if dfld.idfield == fld),
                        None)
        return AnaDfield(fld, self)

    def tree(self, mode='derived', width=5, lname=20, string=True):
        '''return a string with a tree of derived Field.

        *Parameters*

        - **lname** : integer (default 20) - length of the names
        - **width** : integer (default 5) - length of the lines
        - **string** : boolean (default True) - if True return str else return dict
        - **mode** : string (default 'derived') - kind of tree :
            'derived' : derived tree
            'distance': min distance tree
            'distomin': min distomin tree

        *Raises* : ValueError if mode is not one of the three kinds of tree.
        '''
        if mode not in ('derived', 'distance', 'distomin'):
            raise ValueError(
                "mode must be 'derived', 'distance' or 'distomin'")
        parent_attr = {'derived': 'p_derived', 'distance': 'p_distance',
                       'distomin': 'p_distomin'}[mode]
        lis = ['root-' + mode + '*(' + str(len(self)) + ')']
        # first level : the fields whose parent is the root
        childs = [fld for fld in self.fields
                  if getattr(fld, parent_attr) == self.root]
        for fld in childs:
            lis.append(fld.dic_inner_node(mode, lname))
        tree = {str(-1).ljust(2, '*'): lis}
        if string:
            # '*' is a placeholder for spaces : pprint cannot split the labels
            tre = pprint.pformat(tree, indent=0, width=width)
            tre = tre.replace('---', ' - ')
            # NOTE(review): as written this replacement has no effect,
            # presumably meant to merge consecutive spaces - confirm
            tre = tre.replace(' ', ' ')
            tre = tre.replace('*', ' ')
            for car in ["'", "\"", "{", "[", "]", "}", ","]:
                tre = tre.replace(car, "")
            return tre
        return Util.clean_dic(tree, '*', ' ')

    def to_dict(self, mode='field', keys=None, relations=False):
        '''return a dict with fields attributes and optionally relations attributes.

        *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')
        - **relations** : boolean (default: False) - if False return a list of fields,
        if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}'
        - **keys** : string, list or tuple - list of keys or single key to return
        if 'all' or None, all keys are returned
        if list, only keys in list are returned
        if string, only values associated to the string(key) are returned'''
        fields = Util.filter_dic([fld.to_dict(mode=mode)
                                  for fld in self.fields], keys)
        leng = len(self.fields)
        if not relations:
            return fields
        # index -1 stands for the root field
        return {'fields': fields, 'relations':
                [self.get_relation(i, j).to_dict(full=True, mode=mode)
                 for i in range(-1, leng) for j in range(i + 1, leng)]}

    def partitions(self, mode='field', distributed=True):
        '''return a list of available partitions (the first is highest).

        *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')
        - **distributed** : boolean (default True) - Include only distributed fields
        '''
        # every rooted field is a partition on its own
        partit = [[fld] for fld in self.fields if fld.category == ROOTED]
        crossed = [rel for rel in self.ana_relations if rel.typecoupl == CROSSED
                   and rel.parent_child
                   and rel.relation[0].category != COUPLED
                   and rel.relation[1].category != COUPLED]
        if distributed:
            crossed = [rel for rel in crossed if rel.distrib]
        if crossed and len(crossed) == 1 and crossed[0].dist == len(self):
            partit.insert(0, crossed[0].relation)
        elif crossed:
            for repeat in range(len(crossed)):
                candidates = combinations(crossed, repeat + 1)
                for candidat in candidates:
                    flds = list(set(rel.relation[i]
                                    for rel in candidat for i in [0, 1]))
                    # a set of fields is kept if the product of the codec
                    # lengths is the dataset length and if there is one
                    # relation per pair of fields (n * (n - 1) / 2)
                    if (reduce(mul, [fld.lencodec for fld in flds]) == len(self) and
                        len(candidat) == sum(range(len(flds))) and
                            (not distributed or min(rel.distrib for rel in candidat))):
                        partit.insert(0, flds)
        partit = Util.view(partit, mode)
        # duplicates are removed, then the longest partitions come first
        return [list(tup) for tup in
                sorted(sorted(list({tuple(sorted(prt)) for prt in partit})),
                       key=len, reverse=True)]

    def field_partition(self, mode='field', partition=None, distributed=True):
        '''return a partition dict with the list of primary, secondary, unique
        and variable fields.

        *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')
        - **partition** : list (default None) - if None, partition is the first
        - **distributed** : boolean (default True) - Include only distributed fields
        '''
        if not partition:
            partitions = self.partitions(distributed=distributed)
            if not partitions:
                return {'primary': [], 'secondary': [], 'unique': [], 'variable': []}
            partition = partitions[0]
        else:
            partition = [self.dfield(fld) for fld in partition]
        secondary = []
        for field in partition:
            self._add_child(field, secondary)
        secondary = [fld for fld in secondary if fld not in partition]
        unique = [fld for fld in self.fields if fld.category == UNIQUE]
        classified = partition + secondary + unique
        variable = [fld for fld in self.fields if fld not in classified]
        return Util.view({'primary': partition, 'secondary': secondary,
                          'unique': unique, 'variable': variable}, mode)

    def indicator(self, fullsize, size):
        '''generate size indicators: ol (object lightness), ul (unicity level),
        gain (sizegain)

        *Parameters*

        - **fullsize** : int - size with full codec
        - **size** : int - size with existing codec

        *Returns* : dict. 'mean coding size' and 'object lightness' are None
        when the number of values equals the number of unique values.

        *Raises* : ZeroDivisionError if the dataset length or fullsize is 0.
        '''
        lenindex = len(self.fields)
        indexlen = sum(fld.lencodec for fld in self.fields)
        nval = len(self) * (lenindex + 1)
        sval = fullsize / nval
        ncod = indexlen + lenindex

        # without repeated values the coding size cannot be computed
        scod = olight = None
        if nval != ncod:
            scod = (size - ncod * sval) / (nval - ncod)
            olight = scod / sval
        return {'total values': nval, 'mean size': round(sval, 3),
                'unique values': ncod,
                'mean coding size': None if scod is None else round(scod, 3),
                'unicity level': round(ncod / nval, 3),
                'optimize level': round(size / fullsize, 3),
                'object lightness': None if olight is None else round(olight, 3),
                'maxgain': round((nval - ncod) / nval, 3),
                'gain': round((fullsize - size) / fullsize, 3)}

    def _add_child(self, field, childs):
        ''' add derived or coupled fields in the childs list'''
        for rel in field.list_c_derived + field.list_coupled:
            child = rel.relation[1]
            if not child in childs and not child.category == UNIQUE:
                childs.append(child)
                # the descendants of the child are added too
                if not child.category in (COUPLED, UNIQUE):
                    self._add_child(child, childs)
This class analyses the structure of a dataset.
Attributes :
- iddataset : string or integer - Id of the Dataset
- fields : list of the AnaDfields included
- relations : dict of the AnaRelations between two AnaDfields
- hashd : string - update identifier
relationship (@property)
field (@property)
global (@property)
update (instance methods)
access (instance methods)
synthesis (instance methods)
def __init__(self, fields=None, relations=None, iddataset=None,
             leng=None, hashd=None):
    '''Creation mode :
    - single dict attribute where keys are attributes name,
    - single AnaDataset attribute to make a copy
    - multiple attributes

    *Parameters (multiple attributes)*

    - **fields** : list (default None) - definition of the fields, each
    item is converted with `AnaField` then `AnaDfield`
    - **relations** : dict (default None) - key is a field and value is
    the dict of relations passed to `set_relations`
    - **iddataset** : string or integer (default None) - Id of the Dataset
    - **leng** : integer (default None) - if set, value assigned to the
    `maxcodec` of every field
    - **hashd** : string (default None) - update identifier
    '''
    if isinstance(fields, AnaDataset):
        # the attributes are shared with the source object (no duplication)
        self.iddataset = fields.iddataset
        self.fields = fields.fields
        self.relations = fields.relations
        self.hashd = fields.hashd
        return
    if isinstance(fields, dict):
        # 'fields' is read last because the name is rebound by this line
        iddataset = fields.get(IDDATASET, None)
        leng = fields.get(LENGTH, None)
        relations = fields.get(RELATIONS, None)
        hashd = fields.get(HASHD)
        fields = fields.get(FIELDS, None)
    self.iddataset = iddataset
    # the dataset under construction is given to each AnaDfield
    self.fields = [AnaDfield(AnaField(field), self)
                   for field in fields] if fields else []
    if leng:
        for fld in self.fields:
            fld.maxcodec = leng
    # one empty dict of relations per field, filled by set_relations
    self.relations = {field: {} for field in self.fields}
    if relations:
        for fld, dic_relation in relations.items():
            self.set_relations(fld, dic_relation)
    self.hashd = hashd
Creation mode :
- single dict attribute where keys are attributes name,
- single AnaDataset attribute to make a copy
- multiple attributes
Parameters (multiple attributes)
- fields : list (default None) - definition of the fields included in the dataset
- relations : dict (default None) - key is a field and value is the dict of its relations (see set_relations)
- iddataset : string or integer (default None) - Id of the Dataset
- leng : integer (default None) - if set, value assigned to the maxcodec of every field
- hashd : string (default None) - update identifier
def set_relations(self, field, dic_relations):
    '''Store in the AnaDataset the relations described in a dict.

    *Parameters*

    - **field** : AnaDfield, AnaField or str (idfield) - first relation AnaDfield
    - **dic_relations** : dict - key is the second relation AnaDfield and
    value is the dist value or the list [dist, distrib]

    Each relation is stored twice, once for each order of the two fields.
    '''
    source = self.dfield(field)
    for other, dist in dic_relations.items():
        target = self.dfield(other)
        for first, second in ((source, target), (target, source)):
            self.relations[first][second] = AnaRelation([first, second], dist)
Add relations in the AnaDataset from a dict.
Parameters
- field : AnaDfield, AnaField or str (idfield) - first relation AnaDfield
- dic_relations : dict - key is the second relation AnaDfield and value is the dist value or the list [dist, distrib]
def get_relation(self, fld1, fld2):
    '''Return AnaRelation between fld1 and fld2.

    *Parameters*

    - **fld1** : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield
    - **fld2** : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield

    *Returns* : the stored AnaRelation, or a new AnaRelation (dist is the
    length of the dataset) if one of the fields is the root.
    '''
    fl1 = self.dfield(fld1)
    fl2 = self.dfield(fld2)
    if self.root in [fl1, fl2]:
        # relations with the root are not stored, they are built on demand
        return AnaRelation([fl1, fl2], len(self))
    # the fields resolved above are reused (no second call to dfield)
    return self.relations[fl1][fl2]
Return AnaRelation between fld1 and fld2.
Parameters
- fld1 : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield
- fld2 : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield
def dfield(self, fld):
    '''return the AnaDfield matching with fld.

    Fld is str, int, AnaDfield or AnaField :
    - -1 or 'root' : the root AnaDfield
    - AnaDfield : the object itself
    - int : the field at this position in the list of fields
    - str : the first field with this idfield, None if there is none
    - other : a new AnaDfield created from fld'''
    if fld in (-1, ROOT):
        return self.root
    if isinstance(fld, AnaDfield):
        return fld
    if isinstance(fld, int):
        return self.fields[fld]
    if not isinstance(fld, str):
        return AnaDfield(fld, self)
    return next((dfld for dfld in self.fields if dfld.idfield == fld), None)
return the AnaDfield matching with fld. Fld is str, int, AnaDfield or AnaField
def tree(self, mode='derived', width=5, lname=20, string=True):
    '''return a string with a tree of derived Field.

    *Parameters*

    - **lname** : integer (default 20) - length of the names
    - **width** : integer (default 5) - length of the lines
    - **string** : boolean (default True) - if True return str else return dict
    - **mode** : string (default 'derived') - kind of tree :
        'derived' : derived tree
        'distance': min distance tree
        'distomin': min distomin tree

    *Raises* : ValueError if mode is not one of the three kinds of tree.
    '''
    if mode not in ('derived', 'distance', 'distomin'):
        # an unknown mode previously failed later with an unbound variable
        raise ValueError("mode must be 'derived', 'distance' or 'distomin'")
    parent_attr = {'derived': 'p_derived', 'distance': 'p_distance',
                   'distomin': 'p_distomin'}[mode]
    lis = ['root-' + mode + '*(' + str(len(self)) + ')']
    # first level : the fields whose parent is the root
    childs = [fld for fld in self.fields
              if getattr(fld, parent_attr) == self.root]
    for fld in childs:
        lis.append(fld.dic_inner_node(mode, lname))
    tree = {str(-1).ljust(2, '*'): lis}
    if string:
        # '*' is a placeholder for spaces : pprint cannot split the labels
        tre = pprint.pformat(tree, indent=0, width=width)
        tre = tre.replace('---', ' - ')
        # NOTE(review): as written this replacement has no effect,
        # presumably meant to merge consecutive spaces - confirm
        tre = tre.replace(' ', ' ')
        tre = tre.replace('*', ' ')
        for car in ["'", "\"", "{", "[", "]", "}", ","]:
            tre = tre.replace(car, "")
        return tre
    return Util.clean_dic(tree, '*', ' ')
return a string with a tree of derived Field.
Parameters
- lname : integer (default 20) - length of the names
- width : integer (default 5) - length of the lines
- string : boolean (default True) - if True return str else return dict
- mode : string (default 'derived') - kind of tree : 'derived' : derived tree 'distance': min distance tree 'distomin': min distomin tree
def to_dict(self, mode='field', keys=None, relations=False):
    '''return the description of the fields and, on request, of the relations.

    *Parameters*

    - **mode** : str (default 'field') - AnaDfield representation
    ('field', 'id', 'index')
    - **relations** : boolean (default: False) - if False return a list of fields,
    if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}'
    - **keys** : string, list or tuple - list of keys or single key to return
    if 'all' or None, all keys are returned
    if list, only keys in list are returned
    if string, only values associated to the string(key) are returned'''
    described = [fld.to_dict(mode=mode) for fld in self.fields]
    fields = Util.filter_dic(described, keys)
    if not relations:
        return fields
    # each pair of indexes is used once (lower index first), -1 is the root
    pairs = combinations(range(-1, len(self.fields)), 2)
    return {'fields': fields,
            'relations': [self.get_relation(first, second).to_dict(full=True,
                                                                   mode=mode)
                          for first, second in pairs]}
return a dict with fields attributes and optionally relations attributes.
Parameters
- mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
- relations : boolean (default: False) - if False return a list of fields, if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}'
- keys : string, list or tuple - list of keys or single key to return if 'all' or None, all keys are returned if list, only keys in list are returned if string, only values associated to the string(key) are returned
def partitions(self, mode='field', distributed=True):
    '''return a list of available partitions (the first is highest).

    *Parameters*

    - **mode** : str (default 'field') - AnaDfield representation
    ('field', 'id', 'index')
    - **distributed** : boolean (default True) - Include only distributed fields

    *Returns* : list of partitions (each one is a sorted list of fields)
    without duplicates, the longest partitions first.
    '''
    # every rooted field is a partition on its own
    partit = [[fld] for fld in self.fields if fld.category == ROOTED]
    # crossed relations, one orientation only, without coupled fields
    crossed = [rel for rel in self.ana_relations if rel.typecoupl == CROSSED
               # and rel.relation[1].index > rel.relation[0].index
               and rel.parent_child
               and rel.relation[0].category != COUPLED
               and rel.relation[1].category != COUPLED]
    if distributed:
        crossed = [rel for rel in crossed if rel.distrib]
    if crossed and len(crossed) == 1 and crossed[0].dist == len(self):
        # a single crossed relation : its two fields are the partition
        partit.insert(0, crossed[0].relation)
    elif crossed:
        # every combination of 1 to len(crossed) relations is tested
        for repeat in list(range(len(crossed))):
            candidates = combinations(crossed, repeat + 1)
            for candidat in candidates:
                flds = list(set(rel.relation[i]
                                for rel in candidat for i in [0, 1]))
                # a set of fields is kept if the product of the codec
                # lengths is the dataset length and if there is one
                # relation per pair of fields (n * (n - 1) / 2)
                if (reduce(mul, [fld.lencodec for fld in flds]) == len(self) and
                    len(candidat) == sum(range(len(flds))) and
                    (not distributed or min(rel.distrib for rel in candidat))):
                    partit.insert(0, flds)
    partit = Util.view(partit, mode)
    # duplicates are removed with a set of sorted tuples, then the result
    # is sorted and ordered by decreasing length
    return [list(tup) for tup in
            sorted(sorted(list({tuple(sorted(prt)) for prt in partit})),
                   key=len, reverse=True)]
return a list of available partitions (the first is highest).
Parameters
- mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
- distributed : boolean (default True) - Include only distributed fields
def field_partition(self, mode='field', partition=None, distributed=True):
    '''return the distribution of the fields between primary, secondary,
    unique and variable fields for a partition.

    *Parameters*

    - **mode** : str (default 'field') - AnaDfield representation
    ('field', 'id', 'index')
    - **partition** : list (default None) - if None, partition is the first
    - **distributed** : boolean (default True) - Include only distributed fields

    *Returns* : dict with the keys 'primary', 'secondary', 'unique' and
    'variable' (four empty lists if no partition is available).
    '''
    if partition:
        partition = [self.dfield(fld) for fld in partition]
    else:
        available = self.partitions(distributed=distributed)
        if not available:
            return {'primary': [], 'secondary': [], 'unique': [], 'variable': []}
        partition = available[0]
    descendants = []
    for field in partition:
        self._add_child(field, descendants)
    secondary = [fld for fld in descendants if fld not in partition]
    unique = [fld for fld in self.fields if fld.category == UNIQUE]
    # the variable fields are the fields not yet classified
    classified = partition + secondary + unique
    variable = [fld for fld in self.fields if fld not in classified]
    return Util.view({'primary': partition, 'secondary': secondary,
                      'unique': unique, 'variable': variable}, mode)
return a partition dict with the list of primary, secondary, unique and variable fields.
Parameters
- mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
- partition : list (default None) - if None, partition is the first
- distributed : boolean (default True) - Include only distributed fields
def indicator(self, fullsize, size):
    '''generate size indicators: ol (object lightness), ul (unicity level),
    gain (sizegain)

    *Parameters*

    - **fullsize** : int - size with full codec
    - **size** : int - size with existing codec

    *Returns* : dict. 'mean coding size' and 'object lightness' are None
    when the number of values equals the number of unique values.

    *Raises* : ZeroDivisionError if the dataset length or fullsize is 0.
    '''
    lenindex = len(self.fields)
    indexlen = sum(fld.lencodec for fld in self.fields)
    nval = len(self) * (lenindex + 1)
    sval = fullsize / nval
    ncod = indexlen + lenindex

    # without repeated values the coding size cannot be computed : both
    # indicators stay None instead of being unbound or rounded
    scod = olight = None
    if nval != ncod:
        scod = (size - ncod * sval) / (nval - ncod)
        olight = scod / sval
    return {'total values': nval, 'mean size': round(sval, 3),
            'unique values': ncod,
            'mean coding size': None if scod is None else round(scod, 3),
            'unicity level': round(ncod / nval, 3),
            'optimize level': round(size / fullsize, 3),
            'object lightness': None if olight is None else round(olight, 3),
            'maxgain': round((nval - ncod) / nval, 3),
            'gain': round((fullsize - size) / fullsize, 3)}
generate size indicators: ol (object lightness), ul (unicity level), gain (sizegain)
Parameters
- fullsize : int - size with full codec
- size : int - size with existing codec
Returns : dict
class Util:
    ''' shared helper functions of the analysis package (static methods only)'''

    @staticmethod
    def view(field_struc, mode):
        ''' return a representation of a AnaDfields structure (fields, id, index).

        *Parameters*

        - **mode** : str - AnaDfield representation ('field', 'id', 'index')
        - **field_struc** : list or dict - structure to represent

        The structure is returned unchanged if it is empty or if mode is None
        or 'field'. A dict of lists, a list of lists, a list or a single
        AnaField are converted, any other object is returned unchanged.
        '''
        if mode is None or mode == 'field' or not field_struc:
            return field_struc

        def _rep(fld):
            # 'id' selects idfield, any other remaining mode selects index
            return fld.idfield if mode == 'id' else fld.index

        if isinstance(field_struc, dict):
            return {key: [_rep(fld) for fld in val]
                    for key, val in field_struc.items()}
        if isinstance(field_struc, list):
            # the first item tells if the list is a list of lists
            if isinstance(field_struc[0], list):
                return [[_rep(fld) for fld in val] for val in field_struc]
            return [_rep(fld) for fld in field_struc]
        if isinstance(field_struc, AnaField):
            return _rep(field_struc)
        return field_struc

    @staticmethod
    def reduce_dic(obj):
        '''return a copy of the dict without the items whose value is None'''
        return {key: val for key, val in obj.items() if val is not None}

    @staticmethod
    def clean_dic(obj, old, new):
        '''return a dict, a list or a str where the "old" substring is replaced
        with the "new" substring in every string (keys and values, at any depth).
        Other objects are returned unchanged'''
        if isinstance(obj, str):
            return obj.replace(old, new)
        if isinstance(obj, list):
            return [Util.clean_dic(item, old, new) for item in obj]
        if isinstance(obj, dict):
            return {Util.clean_dic(key, old, new): Util.clean_dic(val, old, new)
                    for key, val in obj.items()}
        return obj

    @staticmethod
    def filter_dic(obj, keys):
        '''return extract of a list of dict or of a dict

        *Parameters*

        - **keys** : string, list or tuple - list of keys or single key to return
        if 'all' or None, all keys are returned
        if list, only keys in list are returned
        if string, only values associated to the string(key) are returned'''
        if not keys or keys == 'all':
            return obj
        if isinstance(obj, list):
            return [Util.filter_dic(dic, keys) for dic in obj]
        if not isinstance(obj, dict):
            return obj
        if isinstance(keys, str):
            return obj.get(keys)
        if isinstance(keys, (list, tuple)):
            return {key: val for key, val in obj.items() if key in keys}
        return obj
common functions for analysis package
1072 @staticmethod 1073 def view(field_struc, mode): 1074 ''' return a representation of a AnaDfields structure (fields, id, index). 1075 1076 *Parameters* 1077 1078 - **mode** : str - AnaDfield representation ('field', 'id', 'index') 1079 - **field_struc** : list or dict - structure to represent 1080 ''' 1081 if mode is None or mode == 'field' or not field_struc: 1082 return field_struc 1083 if isinstance(field_struc, dict): 1084 return {key: [fld.idfield if mode == 'id' else fld.index for fld in val] 1085 for key, val in field_struc.items()} 1086 if isinstance(field_struc, list) and isinstance(field_struc[0], list): 1087 return [[fld.idfield if mode == 'id' else fld.index for fld in val] 1088 for val in field_struc] 1089 if isinstance(field_struc, list): 1090 return [fld.idfield if mode == 'id' else fld.index for fld in field_struc] 1091 if isinstance(field_struc, AnaField): 1092 return field_struc.idfield if mode == 'id' else field_struc.index 1093 return field_struc
Return a representation of an AnaDfield structure (as fields, ids or indexes).
Parameters
- mode : str - AnaDfield representation ('field', 'id', 'index')
- field_struc : list or dict - structure to represent
1095 @staticmethod 1096 def reduce_dic(obj): 1097 '''return a dict without empty or None values''' 1098 return {key: val for key, val in obj.items() if not val is None}
Return a dict without the items whose value is None (other empty values such as 0, '' or [] are kept).
1100 @staticmethod 1101 def clean_dic(obj, old, new): 1102 '''return a dict or list with updated strings by replacing "old" substring 1103 with "new" substring''' 1104 if isinstance(obj, dict): 1105 return {Util.clean_dic(key, old, new): Util.clean_dic(val, old, new) 1106 for key, val in obj.items()} 1107 if isinstance(obj, str): 1108 return obj.replace(old, new) 1109 if isinstance(obj, list): 1110 return [Util.clean_dic(val, old, new) for val in obj] 1111 return obj
return a dict or list with updated strings by replacing "old" substring with "new" substring
1113 @staticmethod 1114 def filter_dic(obj, keys): 1115 '''return extract of a list of dict or of a dict 1116 1117 *Parameters* 1118 1119 - **keys** : string, list or tuple - list of keys or single key to return 1120 if 'all' or None, all keys are returned 1121 if list, only keys in list are returned 1122 if string, only values associated to the string(key) are returned''' 1123 if not keys or keys == 'all': 1124 return obj 1125 if isinstance(obj, list): 1126 return [Util.filter_dic(dic, keys) for dic in obj] 1127 if isinstance(keys, str) and isinstance(obj, dict): 1128 return obj.get(keys, None) 1129 if isinstance(keys, (list, tuple)) and isinstance(obj, dict): 1130 return {key: val for key, val in obj.items() if key in keys} 1131 return obj
return extract of a list of dict or of a dict
Parameters
- keys : string, list or tuple - list of keys or single key to return: if 'all' or None, all keys are returned; if list, only keys in the list are returned; if string, only the values associated to the string (key) are returned
Analysis Exception
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback