tab-analysis.tab_analysis.analysis
This module analyses the structure and relationships included in a tabular object (Pandas DataFrame, Dataset, list of lists):
- Structure of a single field (class `AnaField`),
- Relationship between two fields (class `AnaRelation`),
- Structure and relationships of fields inside a dataset (class `AnaDfield`),
- Structure of a dataset (class `AnaDataset`).
# -*- coding: utf-8 -*-
"""
This module analyses structure and relationships included in a tabular object
(Pandas DataFrame, Dataset, list of list) :
- Structure of a single field (class `AnaField`),
- Relationship between two fields (class `AnaRelation`)
- Structure and relationships of fields inside a dataset (class `AnaDfield`)
- Structure of a dataset (class `AnaDataset`)

It contains two other classes `Util`, `AnaError`.
"""
import json
import pprint
from itertools import combinations
from operator import mul
from functools import reduce

NULL = 'null'
UNIQUE = 'unique'
COMPLETE = 'complete'
FULL = 'full'
DEFAULT = 'default'
MIXED = 'mixed'

COUPLED = 'coupled'
DERIVED = 'derived'
LINKED = 'linked'
CROSSED = 'crossed'
DISTRIBUTED = 'distributed'
ROOTED = 'rooted'
ROOT = 'root'

IDFIELD = 'id'
MINCODEC = 'mincodec'
MAXCODEC = 'maxcodec'
LENCODEC = 'lencodec'
RATECODEC = 'ratecodec'
DMINCODEC = 'dmincodec'
DMAXCODEC = 'dmaxcodec'
RANCODEC = 'rancodec'
TYPECODEC = 'typecodec'
HASHF = 'hashf'
RELATION = 'relation'
HASHR = 'hashr'
DIST = 'dist'
DMAX = 'dmax'
DMIN = 'dmin'
DIFF = 'diff'
DRAN = 'dran'
NUM = 'num'
CATEGORY = 'category'
PDERIVED = 'pderived'
PDISTANCE = 'pdistance'
PDISTOMIN = 'pdistomin'
DISDISTANCE = 'disdistance'
DERDISTANCE = 'derdistance'
DISRATECPL = 'disratecpl'
DERRATECPL = 'derratecpl'
DISRATEDER = 'disrateder'
DERRATEDER = 'derrateder'

TYPECOUPL = 'typecoupl'
PARENTCHILD = 'parentchild'
DISTANCE = 'distance'
DISTOMIN = 'distomin'
DISTOMAX = 'distomax'
DISTROOT = 'distroot'
RATECPL = 'ratecpl'
RATEDER = 'rateder'

IDDATASET = 'name'
RELATIONS = 'relations'
FIELDS = 'fields'
LENGTH = 'length'
HASHD = 'hashd'


class AnaField:
    '''This class analyses field entities.

    *Attributes*

    - **idfield** : string - name or Id of the field
    - **lencodec**: integer - codec length
    - **mincodec**: integer - minimal codec length
    - **maxcodec**: integer - maximal codec length
    - **hashf**: integer - hash value to identify modifications

    *characteristic (@property)*

    - `iscomplete`
    - `ratecodec`
    - `dmincodec`
    - `dmaxcodec`
    - `rancodec`
    - `typecodec`

    *instance methods*

    - `to_dict`

    '''

    def __init__(self, idfield, lencodec=None, mincodec=None, maxcodec=None, hashf=None):
        '''Creation mode :
        - single dict attribute where keys are attributes name,
        - single AnaField attribute to make a copy
        - multiple attributes

        *Parameters (multiple attributes)*

        - **idfield** : string or integer - Id of the Field
        - **lencodec** : integer (default None) - length of the codec
        - **mincodec** : integer (default None) - number of different values
        - **maxcodec** : integer (default None) - length of the field
        - **hashf** : string (default None) - update identifier

        *Raises* : AnaError if, in the multiple attributes mode, lencodec is
        missing, zero or not an integer.

        *example*

        AnaField is created with a dict
        >>> AnaField({'id': 'a', 'lencodec': 4, 'mincodec': 3, 'maxcodec': 4})
        AnaField(a)

        AnaField is created with parameters
        >>> AnaField('a', lencodec=4, mincodec=3, maxcodec=4).to_dict(notnone=False)
        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
        >>> AnaField('a', 4, 3, 4).to_dict(notnone=False)
        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
        '''
        if isinstance(idfield, dict):
            # dict mode: missing keys give None attributes (no validation)
            self.idfield = idfield.get(IDFIELD, None)
            self.lencodec = idfield.get(LENCODEC, None)
            self.mincodec = idfield.get(MINCODEC, None)
            self.maxcodec = idfield.get(MAXCODEC, None)
            self.hashf = idfield.get(HASHF, None)
            return
        if isinstance(idfield, (AnaField, AnaDfield)):
            # copy mode: only the AnaField attributes are copied
            self.idfield = idfield.idfield
            self.lencodec = idfield.lencodec
            self.mincodec = idfield.mincodec
            self.maxcodec = idfield.maxcodec
            self.hashf = idfield.hashf
            return
        if not lencodec or not isinstance(lencodec, int):
            raise AnaError("lencodec is not correct")
        self.idfield = idfield
        self.lencodec = lencodec
        self.mincodec = mincodec
        self.maxcodec = maxcodec
        self.hashf = hashf

    def __len__(self):
        '''length of the field (maxcodec if defined, else lencodec)'''
        return self.maxcodec if self.maxcodec else self.lencodec

    def __repr__(self):
        '''representation of the field (class name + idfield)'''
        return self.__class__.__name__ + '(' + str(self.idfield) + ')'

    def __eq__(self, other):
        ''' equal if class and attributes are equal'''
        return self.__class__.__name__ == other.__class__.__name__ and \
            self.idfield == other.idfield and self.lencodec == other.lencodec and \
            self.mincodec == other.mincodec and self.maxcodec == other.maxcodec and \
            self.hashf == other.hashf

    def __lt__(self, other):
        ''' return a comparison between hash value'''
        return hash(self) < hash(other)

    def __hash__(self):
        '''return hash value (sum of attributes hash)'''
        return hash(self.idfield) + hash(self.lencodec) + hash(self.mincodec) \
            + hash(self.maxcodec) + hash(self.hashf)

    def __str__(self):
        '''json-text build with the attributes dict'''
        return json.dumps(self.to_dict(idfield=True))

    def __copy__(self):
        ''' Copy all the attributes '''
        return self.__class__(self)

    def to_dict(self, full=False, idfield=False, notnone=True):
        '''return a dict with field attributes.

        *Parameters*

        - **full** : boolean (default False) - if True, all the attributes are included
        - **idfield** : boolean (default False) - if True, idfield is included
        - **notnone** : boolean (default True) - if True, the dict is passed
        through `Util.reduce_dic` (None values are not included)
        '''
        dic = {LENCODEC: self.lencodec, MINCODEC: self.mincodec,
               MAXCODEC: self.maxcodec}
        if idfield or full:
            dic[IDFIELD] = self.idfield
        if full:
            dic |= {RATECODEC: self.ratecodec, DMINCODEC: self.dmincodec,
                    DMAXCODEC: self.dmaxcodec, RANCODEC: self.rancodec,
                    TYPECODEC: self.typecodec}
        if notnone:
            return Util.reduce_dic(dic)
        return dic

    @property
    def iscomplete(self):
        '''return boolean indicator : True if maxcodec and mincodec are present'''
        return self.maxcodec is not None and self.mincodec is not None

    @property
    def ratecodec(self):
        '''return float ratecodec indicator (None if not complete or if
        maxcodec equals mincodec)'''
        if self.iscomplete and self.maxcodec - self.mincodec:
            return (self.maxcodec - self.lencodec) / (self.maxcodec - self.mincodec)
        return None

    @property
    def dmincodec(self):
        '''return integer dmincodec indicator (lencodec - mincodec)'''
        return self.lencodec - self.mincodec if self.iscomplete else None

    @property
    def dmaxcodec(self):
        '''return integer dmaxcodec indicator (maxcodec - lencodec)'''
        return self.maxcodec - self.lencodec if self.iscomplete else None

    @property
    def rancodec(self):
        '''return integer rancodec indicator (maxcodec - mincodec)'''
        return self.maxcodec - self.mincodec if self.iscomplete else None

    @property
    def typecodec(self):
        '''return string typecodec indicator
        (null, unique, complete, full, default, mixed) or None if the field
        is not complete. Tests are ordered : the first match is returned.
        '''
        if self.maxcodec is None or self.mincodec is None:
            return None
        if self.maxcodec == 0:
            return NULL
        if self.lencodec == 1:
            return UNIQUE
        if self.mincodec == self.maxcodec:
            return COMPLETE
        if self.lencodec == self.maxcodec:
            return FULL
        if self.lencodec == self.mincodec:
            return DEFAULT
        return MIXED


class AnaRelation:
    '''This class analyses relationship between two fields

    *Attributes* :

    - **relation** : List of the two fields involved in the relationship
    - **dist** : value of the relationship
    - **distrib** : boolean True if values are distributed
    - **hashr**: integer - hash value to identify update

    *global (@property)*

    - `id_relation`
    - `index_relation`
    - `parent_child`
    - `typecoupl`

    *characteristic (@property)*

    - `dmax`
    - `dmin`
    - `diff`
    - `dran`
    - `distomin`
    - `distomax`
    - `distance`
    - `ratecpl`
    - `rateder`

    *instance methods*

    - `to_dict`
    '''

    def __init__(self, relation, dists, hashr=None):
        '''Constructor of the relationship :

        *Parameters*

        - **relation** : List of the two fields involved in the relationship
        - **dists** : dist value, or list / tuple [dist, distrib] where
        distrib is a boolean (True if values are distributed)
        - **hashr**: integer - hash value to identify update
        '''
        self.relation = relation
        # a tuple is accepted like a list (it would otherwise be stored as dist)
        if isinstance(dists, (list, tuple)):
            self.dist = dists[0]
            self.distrib = dists[1]
        else:
            self.dist = dists
            self.distrib = None
        self.hashr = hashr

    def __repr__(self):
        '''representation of the relation (class name + id of the two fields)'''
        return self.__class__.__name__ + '(' + str(self.id_relation) + ')'

    def __str__(self):
        '''json-text build with the attributes dict'''
        return json.dumps(self.to_dict(relation=True))

    def __eq__(self, other):
        ''' equal if class and values are equal'''
        return self.__class__.__name__ == other.__class__.__name__ and \
            self.relation == other.relation and self.dist == other.dist and \
            self.hashr == other.hashr and self.distrib == other.distrib

    def __hash__(self):
        '''return hash value (sum of attributes hash)'''
        return hash(self.relation[0]) + hash(self.relation[1]) + \
            hash(self.dist) + hash(self.hashr) + hash(self.distrib)

    def to_dict(self, distances=False, full=False, mode='field', relation=False,
                notnone=True, misc=False):
        '''return a dict with AnaRelation attributes.

        *Parameters*

        - **distances** : boolean (default False) - if True, distances indicators are included
        - **full** : boolean (default False) - if True, all the attributes are included
        - **relation** : boolean (default False) - if True, idfield are included
        - **notnone** : boolean (default True) - if True, the dict is passed
        through `Util.reduce_dic` (None values are not included)
        - **misc** : boolean (default False) - if True, dmax, dmin, diff and
        dran indicators are included
        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
        '''
        dic = {DIST: self.dist, TYPECOUPL: self.typecoupl, HASHR: self.hashr}
        if relation or full:
            dic[RELATION] = Util.view(self.relation, mode)
            dic[PARENTCHILD] = self.parent_child
        if distances or full:
            dic |= {DISTANCE: self.distance, DISTOMIN: self.distomin,
                    DISTOMAX: self.distomax, DISTRIBUTED: self.distrib,
                    RATECPL: self.ratecpl, RATEDER: self.rateder}
        if misc or full:
            dic |= {DMAX: self.dmax, DMIN: self.dmin,
                    DIFF: self.diff, DRAN: self.dran}
        if notnone:
            return Util.reduce_dic(dic)
        return dic

    @property
    def id_relation(self):
        '''return a list with the id of the two fields involved'''
        if self.relation:
            return [fld.idfield for fld in self.relation]
        return []

    @property
    def parent_child(self):
        '''returns the direction of the relationship (True if parent is first).

        The parent is the field with the highest lencodec. When lencodec are
        equal, the `index` of the fields is compared (the fields then have to
        expose an `index` attribute, as AnaDfield does).'''
        rel0 = self.relation[0]
        rel1 = self.relation[1]
        return (rel0.lencodec > rel1.lencodec or
                (rel0.lencodec == rel1.lencodec and rel0.index < rel1.index))

    @property
    def index_relation(self):
        '''return a list with the index of the two fields involved'''
        if self.relation:
            return [fld.index for fld in self.relation]
        return []

    @property
    def dmax(self):
        '''return integer dmax indicator (product of the two lencodec)'''
        return self.relation[0].lencodec * self.relation[1].lencodec

    @property
    def dmin(self):
        '''return integer dmin indicator (max of the two lencodec)'''
        return max(self.relation[0].lencodec, self.relation[1].lencodec)

    @property
    def diff(self):
        '''return integer diff indicator (absolute difference of the two lencodec)'''
        return abs(self.relation[0].lencodec - self.relation[1].lencodec)

    @property
    def dran(self):
        '''return integer dran indicator (dmax - dmin)'''
        return self.dmax - self.dmin

    @property
    def distomin(self):
        '''return integer distomin indicator (dist - dmin)'''
        return self.dist - self.dmin

    @property
    def distomax(self):
        '''return integer distomax indicator (dmax - dist)'''
        return self.dmax - self.dist

    @property
    def distance(self):
        '''return integer distance indicator (distomin + diff)'''
        return self.distomin + self.diff

    @property
    def ratecpl(self):
        '''return float ratecpl indicator (0 if distance + distomax is null)'''
        disdis = self.distance + self.distomax
        return 0 if disdis == 0 else self.distance / disdis

    @property
    def rateder(self):
        '''return float rateder indicator (0 if dran is null)'''
        return 0 if self.dran == 0 else self.distomin / self.dran

    @property
    def typecoupl(self):
        '''return relationship type (coupled, derived, crossed, linked).
        Tests are ordered : the first match is returned.'''
        if self.distance == 0:
            return COUPLED
        if self.distomin == 0:
            return DERIVED
        if self.distomax == 0:
            return CROSSED
        return LINKED


class AnaDfield(AnaField):
    '''This class analyses structure and relationships of fields inside a dataset

    *Attributes* :

    - **dataset** : AnaDataset object where AnaDfield is included
    - **AnaField attributes** : inheritance of AnaField object

    *relationship (@property)*

    - `list_relations`
    - `list_p_derived`
    - `list_c_derived`
    - `list_coupled`

    *field (@property)*

    - `fields`
    - `p_derived`
    - `p_distance`
    - `p_distomin`

    *global (@property)*

    - `index`
    - `dist_root`
    - `category`

    *global (instance methods)*

    - `ascendants`
    - `to_dict`
    - `view`

    *other instance methods*

    - `dic_inner_node`
    '''

    def __new__(cls, other, dataset=None):
        '''initialization of attributes from "other" (AnaField attributes are
        copied here, `__init__` only adds the dataset link)'''
        if isinstance(other, AnaDfield):
            new = AnaDfield.__copy__(other)
            return new
        if isinstance(other, AnaField):
            new = AnaField.__copy__(other)
            # the AnaField copy is promoted to AnaDfield
            new.__class__ = AnaDfield
            return new
        return object.__new__(cls)

    def __init__(self, other, dataset):
        '''AnaDfield is created by adding a AnaDataset link to an AnaField object.

        *Parameters*

        - **other** : AnaField or AnaDfield to initialize attributes
        - **dataset** : AnaDataset which includes the AnaDfield
        '''
        self.dataset = dataset

    def __copy__(self):
        ''' Copy all the data '''
        return self.__class__(AnaField(self), self.dataset)

    def __lt__(self, other):
        ''' return a comparison between field index'''
        return self.index < other.index

    @property
    def index(self):
        '''return the row of the field in the AnaDataset (-1 for the root)'''
        if self == self.dataset.root:
            return -1
        return self.dataset.fields.index(self)

    @property
    def fields(self):
        '''return the list of the fields included in the AnaDataset'''
        return self.dataset.fields

    @property
    def list_relations(self):
        '''return the list of the relations with the AnaDfield'''
        return list(self.dataset.relations[self].values())

    @property
    def list_p_derived(self):
        '''return the list of the derived relations with the parents of AnaDfield'''
        return [rel for rel in self.list_relations if rel.typecoupl == DERIVED
                and not rel.parent_child]

    @property
    def list_c_derived(self):
        '''return the list of the derived relations with the childs of AnaDfield
        (unique childs are excluded)'''
        return [rel for rel in self.list_relations if rel.typecoupl == DERIVED
                and rel.parent_child
                and rel.relation[1].category != UNIQUE]

    @property
    def list_coupled(self):
        '''return the list of the coupled relations with the AnaDfield'''
        return [rel for rel in self.list_relations if rel.typecoupl == COUPLED]

    @property
    def dist_root(self):
        '''return the distance to the root field'''
        return len(self.dataset) - self.lencodec

    @property
    def category(self):
        '''return AnaDfield category (unique, rooted, coupled, derived, mixed)'''
        if self.typecodec == UNIQUE:
            return UNIQUE
        if self.typecodec in (COMPLETE, FULL):
            return ROOTED
        if COUPLED in [rel.typecoupl for rel in self.list_relations
                       if not rel.parent_child]:
            return COUPLED
        if not self.list_c_derived:
            return DERIVED
        return MIXED

    @property
    def p_derived(self):
        '''return the first derived or coupled parent of the AnaDfield'''
        category = self.category
        if category in (UNIQUE, ROOTED):
            return self.dataset.root
        if category == COUPLED:
            return [rel.relation[1] for rel in self.list_coupled
                    if rel.relation[1].category != COUPLED][0]
        list_p_derived = self.list_p_derived
        if not list_p_derived:
            return self.dataset.root
        distance_min = min(rel.distance for rel in list_p_derived)
        for rel in list_p_derived:
            if rel.distance == distance_min:
                if rel.relation[1].category == ROOTED:
                    return self.dataset.root
                if rel.relation[1].category == MIXED:
                    return rel.relation[1]
        return self.dataset.root

    @property
    def p_distance(self):
        '''return the first parent with minimal distance of the AnaDfield'''
        return self._p_min_dist()

    @property
    def p_distomin(self):
        '''return the first parent with minimal distomin of the AnaDfield'''
        return self._p_min_dist(False)

    def _p_min_dist(self, distance=True):
        '''return the parent with minimal distance of the AnaDfield

        *Parameters*

        - **distance** : boolean (default True) - if True the `distance`
        indicator is used, else the `distomin` indicator
        '''
        if self.category == UNIQUE:
            return self.dataset.root
        indicator = 'distance' if distance else 'distomin'
        relations = self.list_relations
        # the minimum is searched among relations where the other field is parent
        dist_up = [getattr(rel, indicator) for rel in relations
                   if not rel.parent_child]
        if not dist_up or min(dist_up) == self.dist_root:
            return self.dataset.root
        dist_min = min(dist_up)
        # candidates are taken from all the relations (parent or child)
        list_dmin = [rel.relation[1] for rel in relations
                     if getattr(rel, indicator) == dist_min]
        max_lencodec = max(fld.lencodec for fld in list_dmin)
        return [fld for fld in list_dmin if fld.lencodec == max_lencodec][0]

    def to_dict(self, mode='id'):
        '''return a dict with field attributes.

        *Parameters*

        - **mode** : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
        '''
        dic = super().to_dict(full=True, notnone=False)
        dic[DISTROOT] = self.dist_root
        dic[NUM] = self.index
        dic[CATEGORY] = self.category
        dic[PDISTANCE] = self.p_distance.view(mode)
        dic[PDISTOMIN] = self.p_distomin.view(mode)
        dic[PDERIVED] = self.p_derived.view(mode)
        return dic

    def view(self, mode='field'):
        ''' return a representation of the AnaDfield

        *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
        '''
        return Util.view(self, mode)

    def ascendants(self, typeparent='derived', mode='field'):
        ''' return the list of the AnaDfield's ascendants in the family tree up to
        the root AnaDfield.

        *Parameters*

        - **typeparent** : str (default 'derived') - 'derived', 'distance' or 'distomin'
        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')

        *Returns* : list of parents from closest to the most distant. Parents
        are represented with index, idfield, or object
        '''
        parent = self
        listparent = []
        while parent != self.dataset.root:
            if typeparent == 'derived':
                parent = parent.p_derived
            elif typeparent == 'distance':
                parent = parent.p_distance
            else:
                parent = parent.p_distomin
            if parent != self.dataset.root:
                listparent.append(parent)
        return Util.view(listparent, mode)

    def dic_inner_node(self, mode, lname):
        '''return a child AnaDfield tree.

        *Parameters*

        - **lname** : integer - maximal length of the names
        - **mode** : string - kind of tree :
            'derived' : derived tree
            'distance': min distance tree
            'distomin': min distomin tree

        *Returns* : dict where key is the AnaDfield index (str) and value is
        the list with the name of the AnaDfield followed by the child trees.

        *Raises* : AnaError if mode is not one of the three kinds of tree.
        '''
        if mode not in ('derived', 'distance', 'distomin'):
            # previously failed later with an unbound local variable
            raise AnaError("mode is not correct")
        if mode == 'distance':
            rel_parent = self.dataset.get_relation(self, self.p_distance)
            adding = str(rel_parent.distance) + ' - '
        elif mode == 'distomin':
            rel_parent = self.dataset.get_relation(self, self.p_distomin)
            adding = str(rel_parent.distomin) + ' - '
        else:
            rel_parent = self.dataset.get_relation(self, self.p_derived)
            adding = str(rel_parent.distance) + ' - '
        adding += str(self.lencodec)
        name = str(self.idfield)[:lname] + ' (' + adding + ')'
        # spaces and quotes are replaced by '*' (restored by AnaDataset.tree)
        lis = [name.replace(' ', '*').replace("'", '*')]
        childs = []
        if mode == 'derived':
            category = self.category
            if category not in (ROOTED, COUPLED, UNIQUE):
                # coupled fields are displayed at the same level as childs
                for rel in self.list_coupled:
                    lis.append(rel.relation[1].dic_inner_node(mode, lname))
            if category not in (ROOTED, UNIQUE):
                childs = [rel.relation[1] for rel in self.list_relations
                          if rel.relation[1].p_derived == self and
                          rel.relation[1].category != COUPLED]
        elif mode == 'distomin':
            childs = [rel.relation[1] for rel in self.list_relations
                      if rel.relation[1].p_distomin == self]
        else:
            childs = [rel.relation[1] for rel in self.list_relations
                      if rel.relation[1].p_distance == self]
        for fld in childs:
            lis.append(fld.dic_inner_node(mode, lname))
        return {str(self.index).ljust(2, '*'): lis}


class AnaDataset:
    '''This class analyses the structure of a dataset.

    *Attributes* :

    - **iddataset** : string or integer - Id of the Dataset
    - **fields** : list of the AnaDfields included
    - **relations** : dict of the AnaRelations between two AnaDfields
    - **hashd** : string - update identifier

    *relationship (@property)*

    - `ana_relations`
    - `p_relations`

    *field (@property)*

    - `root`
    - `primary`
    - `secondary`
    - `unique`
    - `variable`

    *global (@property)*

    - `category`
    - `complete`
    - `dimension`

    *update (instance methods)*

    - `set_relations`


    *access (instance methods)*

    - `get_relation`
    - `dfield`

    *synthesis (instance methods)*

    - `tree`
    - `to_dict`
    - `indicator`
    - `partitions`
    - `field_partition`
    '''

    def __init__(self, fields=None, relations=None, iddataset=None,
                 leng=None, hashd=None):
        '''Creation mode :
        - single dict attribute where keys are attributes name,
        - single AnaDataset attribute to make a copy
        - multiple attributes

        *Parameters (multiple attributes)*

        - **fields** : list (default None) - list of AnaField (or of values
        accepted by the AnaField constructor)
        - **relations** : dict (default None) - key is a field and value is
        a dict of relations as defined in `set_relations`
        - **iddataset** : string or integer (default None) - Id of the Dataset
        - **leng** : integer (default None) - if present, maxcodec of every
        field is set to this value
        - **hashd** : string (default None) - update identifier

        In the dict mode, keys are 'name', 'length', 'relations', 'hashd'
        and 'fields'. In the copy mode, fields and relations objects are
        shared with the copied AnaDataset.
        '''
        if isinstance(fields, AnaDataset):
            self.iddataset = fields.iddataset
            self.fields = fields.fields
            self.relations = fields.relations
            self.hashd = fields.hashd
            return
        if isinstance(fields, dict):
            iddataset = fields.get(IDDATASET, None)
            leng = fields.get(LENGTH, None)
            relations = fields.get(RELATIONS, None)
            hashd = fields.get(HASHD)
            fields = fields.get(FIELDS, None)
        self.iddataset = iddataset
        self.fields = [AnaDfield(AnaField(field), self)
                       for field in fields] if fields else []
        if leng:
            for fld in self.fields:
                fld.maxcodec = leng
        self.relations = {field: {} for field in self.fields}
        if relations:
            for fld, dic_relation in relations.items():
                self.set_relations(fld, dic_relation)
        self.hashd = hashd

    def __len__(self):
        '''length of the AnaDataset (max length of the AnaDfields included,
        0 if there is no field)'''
        return max((len(fld) for fld in self.fields), default=0)

    def __eq__(self, other):
        ''' equal if class and values are equal'''
        return self.__class__.__name__ == other.__class__.__name__ and \
            self.fields == other.fields and self.relations == other.relations and \
            self.iddataset == other.iddataset and self.hashd == other.hashd

    def __hash__(self):
        '''return hash value (sum of attributes hash)'''
        # NOTE(review): iterating `self.relations` yields its keys (the
        # fields), not the AnaRelation objects - confirm this is intended.
        return hash(self.iddataset) + sum(hash(fld) for fld in self.fields) + \
            sum(hash(rel) for rel in self.relations) + hash(self.hashd)

    @property
    def category(self):
        '''return a list of AnaDfield category (unique, rooted, coupled, derived, mixed)'''
        return [fld.category for fld in self.fields]

    @property
    def ana_relations(self):
        '''return the list of AnaRelation included'''
        return [rel for fldrel in self.relations.values() for rel in fldrel.values()]

    @property
    def p_relations(self):
        '''return the list of oriented AnaRelation (parent first, child second)'''
        return [rel for rel in self.ana_relations if rel.parent_child]

    @property
    def root(self):
        '''return the root AnaDfield (a new object is built at each call)'''
        len_self = len(self)
        return AnaDfield(AnaField(ROOT, len_self, len_self, len_self), self)

    @property
    def primary(self):
        '''return the first partition of the partitions'''
        part = self.partitions(distributed=True)
        return part[0] if part else []

    @property
    def complete(self):
        '''return True if the dimension is not 0'''
        return self.dimension > 0

    @property
    def dimension(self):
        '''return the highest partition length'''
        return len(self.primary)

    @property
    def secondary(self):
        '''return the derived or coupled fields from primary'''
        # primary is computed once (each access runs the partitions search)
        primary = self.primary
        secondary = []
        for field in primary:
            self._add_child(field, secondary)
        return [fld for fld in secondary if fld not in primary]

    @property
    def unique(self):
        '''return the unique fields'''
        return [fld for fld in self.fields if fld.category == UNIQUE]

    @property
    def variable(self):
        '''return the variable fields (fields which are neither primary,
        nor secondary, nor unique)'''
        # computed once, not once per field
        assigned = self.primary + self.secondary + self.unique
        return [fld for fld in self.fields if fld not in assigned]

    def set_relations(self, field, dic_relations):
        '''Add relations in the AnaDataset from a dict.

        *Parameters*

        - **field** : AnaDfield, AnaField or str (idfield) - first relation AnaDfield
        - **dic_relations** : dict - key is the second relation AnaDfield and
        value is the dist value or the list [dist, distrib]
        '''
        fld = self.dfield(field)
        for other, dist in dic_relations.items():
            oth = self.dfield(other)
            # the relation is stored in both directions
            self.relations[fld][oth] = AnaRelation([fld, oth], dist)
            self.relations[oth][fld] = AnaRelation([oth, fld], dist)

    def get_relation(self, fld1, fld2):
        '''Return AnaRelation between fld1 and fld2.

        *Parameters*

        - **fld1** : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield
        - **fld2** : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield
        '''
        fl1 = self.dfield(fld1)
        fl2 = self.dfield(fld2)
        if self.root in [fl1, fl2]:
            # relations with the root are not stored : dist is the dataset length
            return AnaRelation([fl1, fl2], len(self))
        return self.relations[fl1][fl2]

    def dfield(self, fld):
        '''return the AnaDfield matching with fld. Fld is str, int, AnaDfield
        or AnaField (-1 or 'root' return the root, an unknown str returns None)'''
        if fld in (-1, ROOT):
            return self.root
        if isinstance(fld, AnaDfield):
            return fld
        if isinstance(fld, int):
            return self.fields[fld]
        if isinstance(fld, str):
            return next((dfld for dfld in self.fields if dfld.idfield == fld),
                        None)
        return AnaDfield(fld, self)

    def tree(self, mode='derived', width=5, lname=20, string=True):
        '''return a string with a tree of derived Field.

        *Parameters*

        - **lname** : integer (default 20) - length of the names
        - **width** : integer (default 5) - length of the lines
        - **string** : boolean (default True) - if True return str else return dict
        - **mode** : string (default 'derived') - kind of tree :
            'derived' : derived tree
            'distance': min distance tree
            'distomin': min distomin tree

        *Raises* : AnaError if mode is not one of the three kinds of tree.
        '''
        parents = {'derived': 'p_derived', 'distance': 'p_distance',
                   'distomin': 'p_distomin'}
        if mode not in parents:
            # previously failed later with an unbound local variable
            raise AnaError("mode is not correct")
        root = self.root
        lis = ['root-' + mode + '*(' + str(len(self)) + ')']
        childs = [fld for fld in self.fields
                  if getattr(fld, parents[mode]) == root]
        for fld in childs:
            lis.append(fld.dic_inner_node(mode, lname))
        tree = {str(-1).ljust(2, '*'): lis}
        if string:
            tre = pprint.pformat(tree, indent=0, width=width)
            tre = tre.replace('---', ' - ')
            # NOTE(review): the scraped listing showed a no-op
            # replace(' ', ' '); restored as a double-space collapse -
            # confirm against the upstream source.
            tre = tre.replace('  ', ' ')
            tre = tre.replace('*', ' ')
            for car in ["'", "\"", "{", "[", "]", "}", ","]:
                tre = tre.replace(car, "")
            return tre
        return Util.clean_dic(tree, '*', ' ')

    def to_dict(self, mode='field', keys=None, relations=False):
        '''return a dict with fields attributes and optionally relations attributes.

        *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')
        - **relations** : boolean (default: False) - if False return a list of fields,
        if True return a dict with a 'fields' key (list of fields) and a
        'relations' key (list of relations, root included)
        - **keys** : string, list or tuple - list of keys or single key to return
        if 'all' or None, all keys are returned
        if list, only keys in list are returned
        if string, only values associated to the string(key) are returned'''
        fields = Util.filter_dic([fld.to_dict(mode=mode)
                                  for fld in self.fields], keys)
        if not relations:
            return fields
        leng = len(self.fields)
        # index -1 is the root : each pair (i, j) with i < j is returned once
        return {'fields': fields, 'relations':
                [self.get_relation(i, j).to_dict(full=True, mode=mode)
                 for i in range(-1, leng) for j in range(i + 1, leng)]}

    def partitions(self, mode='field', distributed=True):
        '''return a list of available partitions (the first is highest).

        *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')
        - **distributed** : boolean (default True) - Include only distributed fields
        '''
        partit = [[fld] for fld in self.fields if fld.category == ROOTED]
        crossed = [rel for rel in self.ana_relations if rel.typecoupl == CROSSED
                   and rel.parent_child
                   and rel.relation[0].category != COUPLED
                   and rel.relation[1].category != COUPLED]
        if distributed:
            crossed = [rel for rel in crossed if rel.distrib]
        if crossed:
            len_self = len(self)
            if len(crossed) == 1 and crossed[0].dist == len_self:
                partit.insert(0, crossed[0].relation)
            else:
                for size in range(1, len(crossed) + 1):
                    for candidat in combinations(crossed, size):
                        flds = list(set(rel.relation[i]
                                        for rel in candidat for i in [0, 1]))
                        # a partition : product of lencodec is the dataset
                        # length and every pair of fields is crossed
                        if (reduce(mul, [fld.lencodec for fld in flds]) == len_self and
                                len(candidat) == sum(range(len(flds))) and
                                (not distributed or
                                 all(rel.distrib for rel in candidat))):
                            partit.insert(0, flds)
        partit = Util.view(partit, mode)
        # duplicates are removed, the longest partitions are first
        return [list(tup) for tup in
                sorted(sorted({tuple(sorted(prt)) for prt in partit}),
                       key=len, reverse=True)]

    def field_partition(self, mode='field', partition=None, distributed=True):
        '''return a partition dict with the list of primary, secondary, unique
        and variable fields.

        *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')
        - **partition** : list (default None) - if None, partition is the first
        - **distributed** : boolean (default True) - Include only distributed fields
        '''
        if not partition:
            partitions = self.partitions(distributed=distributed)
            if not partitions:
                return {'primary': [], 'secondary': [], 'unique': [], 'variable': []}
            partition = partitions[0]
        else:
            partition = [self.dfield(fld) for fld in partition]
        secondary = []
        for field in partition:
            self._add_child(field, secondary)
        secondary = [fld for fld in secondary if fld not in partition]
        unique = [fld for fld in self.fields if fld.category == UNIQUE]
        assigned = partition + secondary + unique
        variable = [fld for fld in self.fields if fld not in assigned]
        return Util.view({'primary': partition, 'secondary': secondary,
                          'unique': unique, 'variable': variable}, mode)

    def indicator(self, fullsize, size):
        '''generate size indicators: ol (object lightness), ul (unicity level),
        gain (sizegain)

        *Parameters*

        - **fullsize** : int - size with full codec
        - **size** : int - size with existing codec

        *Returns* : dict'''
        lenindex = len(self.fields)
        indexlen = sum(fld.lencodec for fld in self.fields)
        nval = len(self) * (lenindex + 1)
        sval = fullsize / nval
        ncod = indexlen + lenindex
if nval != ncod: 1047 scod = (size - ncod * sval) / (nval - ncod) 1048 olight = scod / sval 1049 else: 1050 olight = None 1051 return {'total values': nval, 'mean size': round(sval, 3), 1052 'unique values': ncod, 'mean coding size': round(scod, 3), 1053 'unicity level': round(ncod / nval, 3), 1054 'optimize level': round(size / fullsize, 3), 1055 'object lightness': round(olight, 3), 1056 'maxgain': round((nval - ncod) / nval, 3), 1057 'gain': round((fullsize - size) / fullsize, 3)} 1058 1059 def _add_child(self, field, childs): 1060 ''' add derived or coupled fields in the childs list''' 1061 for rel in field.list_c_derived + field.list_coupled: 1062 child = rel.relation[1] 1063 if not child in childs and not child.category == UNIQUE: 1064 childs.append(child) 1065 if not child.category in (COUPLED, UNIQUE): 1066 self._add_child(child, childs) 1067 1068 1069class Util: 1070 ''' common functions for analysis package''' 1071 1072 @staticmethod 1073 def view(field_struc, mode): 1074 ''' return a representation of a AnaDfields structure (fields, id, index). 
1075 1076 *Parameters* 1077 1078 - **mode** : str - AnaDfield representation ('field', 'id', 'index') 1079 - **field_struc** : list or dict - structure to represent 1080 ''' 1081 if mode is None or mode == 'field' or not field_struc: 1082 return field_struc 1083 if isinstance(field_struc, dict): 1084 return {key: [fld.idfield if mode == 'id' else fld.index for fld in val] 1085 for key, val in field_struc.items()} 1086 if isinstance(field_struc, list) and isinstance(field_struc[0], list): 1087 return [[fld.idfield if mode == 'id' else fld.index for fld in val] 1088 for val in field_struc] 1089 if isinstance(field_struc, list): 1090 return [fld.idfield if mode == 'id' else fld.index for fld in field_struc] 1091 if isinstance(field_struc, AnaField): 1092 return field_struc.idfield if mode == 'id' else field_struc.index 1093 return field_struc 1094 1095 @staticmethod 1096 def reduce_dic(obj): 1097 '''return a dict without None values''' 1098 if isinstance(obj, dict): 1099 return {key: Util.reduce_dic(val) for key, val in obj.items() 1100 if not val is None} 1101 if isinstance(obj, list): 1102 return [Util.reduce_dic(val) for val in obj] 1103 return obj 1104 1105 @staticmethod 1106 def clean_dic(obj, old, new): 1107 '''return a dict or list with updated strings by replacing "old" substring 1108 with "new" substring''' 1109 if isinstance(obj, dict): 1110 return {Util.clean_dic(key, old, new): Util.clean_dic(val, old, new) 1111 for key, val in obj.items()} 1112 if isinstance(obj, str): 1113 return obj.replace(old, new) 1114 if isinstance(obj, list): 1115 return [Util.clean_dic(val, old, new) for val in obj] 1116 return obj 1117 1118 @staticmethod 1119 def filter_dic(obj, keys): 1120 '''return extract of a list of dict or of a dict 1121 1122 *Parameters* 1123 1124 - **keys** : string, list or tuple - list of keys or single key to return 1125 if 'all' or None, all keys are returned 1126 if list, only keys in list are returned 1127 if string, only values associated to the 
string(key) are returned''' 1128 if not keys or keys == 'all': 1129 return obj 1130 if isinstance(obj, list): 1131 return [Util.filter_dic(dic, keys) for dic in obj] 1132 if isinstance(keys, str) and isinstance(obj, dict): 1133 return obj.get(keys, None) 1134 if isinstance(keys, (list, tuple)) and isinstance(obj, dict): 1135 return {key: val for key, val in obj.items() if key in keys} 1136 return obj 1137 1138 1139class AnaError(Exception): 1140 ''' Analysis Exception''' 1141 # pass
class AnaField:
    '''This class analyses field entities.

    *Attributes*

    - **idfield** : string - name or Id of the field
    - **lencodec**: integer - codec length
    - **mincodec**: integer - minimal codec length
    - **maxcodec**: integer - maximal codec length
    - **hashf**: integer - hash value to identify modifications

    *characteristic (@property)*

    - `iscomplete`
    - `ratecodec`
    - `dmincodec`
    - `dmaxcodec`
    - `rancodec`
    - `typecodec`

    *instance methods*

    - `to_dict`

    '''

    def __init__(self, idfield, lencodec=None, mincodec=None, maxcodec=None, hashf=None):
        '''Creation mode :
        - single dict attribute where keys are attributes name,
        - single AnaField attribute to make a copy
        - multiple attributes

        *Parameters (multiple attributes)*

        - **idfield** : string or integer - Id of the Field
        - **lencodec** : integer (default None) - length of the codec
        - **mincodec** : integer (default None) - number of different values
        - **maxcodec** : integer (default None) - length of the field
        - **hashf** : string (default None) - update identifier

        *Raises* : AnaError if, with multiple attributes, lencodec is missing,
        zero or not an integer.

        *example*

        AnaField is created with a dict
        >>> AnaField({'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}).to_dict()
        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}

        AnaField is created with parameters (idfield is always the first one)
        >>> AnaField('fld', lencodec=4, mincodec=3, maxcodec=4).to_dict()
        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
        >>> AnaField('fld', 4, 3, 4).to_dict()
        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
        '''
        if isinstance(idfield, dict):
            # missing keys give None attributes; no validation in this mode
            self.idfield = idfield.get(IDFIELD, None)
            self.lencodec = idfield.get(LENCODEC, None)
            self.mincodec = idfield.get(MINCODEC, None)
            self.maxcodec = idfield.get(MAXCODEC, None)
            self.hashf = idfield.get(HASHF, None)
            return
        if isinstance(idfield, AnaField):
            # copy mode; AnaDfield is a subclass of AnaField, so it is included
            self.idfield = idfield.idfield
            self.lencodec = idfield.lencodec
            self.mincodec = idfield.mincodec
            self.maxcodec = idfield.maxcodec
            self.hashf = idfield.hashf
            return
        if not lencodec or not isinstance(lencodec, int):
            raise AnaError("lencodec is not correct")
        self.idfield = idfield
        self.lencodec = lencodec
        self.mincodec = mincodec
        self.maxcodec = maxcodec
        self.hashf = hashf

    def __len__(self):
        '''length of the field (maxcodec if it is set and not zero, else lencodec)'''
        return self.maxcodec if self.maxcodec else self.lencodec

    def __repr__(self):
        '''representation of the field (class name + idfield)'''
        return self.__class__.__name__ + '(' + str(self.idfield) + ')'

    def __eq__(self, other):
        ''' equal if class name and attributes are equal'''
        # the class name is tested first: the attributes of `other` are read
        # only when it has the same class name
        return self.__class__.__name__ == other.__class__.__name__ and \
            self.idfield == other.idfield and self.lencodec == other.lencodec and \
            self.mincodec == other.mincodec and self.maxcodec == other.maxcodec and \
            self.hashf == other.hashf

    def __lt__(self, other):
        ''' return a comparison between hash value'''
        return hash(self) < hash(other)

    def __hash__(self):
        '''return hash value (sum of attributes hash)'''
        return hash(self.idfield) + hash(self.lencodec) + hash(self.mincodec) \
            + hash(self.maxcodec) + hash(self.hashf)

    def __str__(self):
        '''json-text build with the attributes dict'''
        return json.dumps(self.to_dict(idfield=True))

    def __copy__(self):
        ''' Copy all the attributes '''
        return self.__class__(self)

    def to_dict(self, full=False, idfield=False, notnone=True):
        '''return a dict with field attributes.

        *Parameters*

        - **full** : boolean (default False) - if True, all the attributes are included
        - **idfield** : boolean (default False) - if True, idfield is included
        - **notnone** : boolean (default True) - if True, None values are not included
        '''
        dic = {LENCODEC: self.lencodec, MINCODEC: self.mincodec,
               MAXCODEC: self.maxcodec}
        if idfield or full:
            dic[IDFIELD] = self.idfield
        if full:
            dic |= {RATECODEC: self.ratecodec, DMINCODEC: self.dmincodec,
                    DMAXCODEC: self.dmaxcodec, RANCODEC: self.rancodec,
                    TYPECODEC: self.typecodec}
        if notnone:
            return Util.reduce_dic(dic)
        return dic

    @property
    def iscomplete(self):
        '''return boolean indicator : True if maxcodec and mincodec are present'''
        return self.maxcodec is not None and self.mincodec is not None

    @property
    def ratecodec(self):
        '''return float ratecodec indicator (None if the field is not complete
        or if maxcodec equals mincodec)'''
        if self.iscomplete and self.maxcodec - self.mincodec:
            return (self.maxcodec - self.lencodec) / (self.maxcodec - self.mincodec)
        return None

    @property
    def dmincodec(self):
        '''return integer dmincodec indicator (lencodec - mincodec)'''
        return self.lencodec - self.mincodec if self.iscomplete else None

    @property
    def dmaxcodec(self):
        '''return integer dmaxcodec indicator (maxcodec - lencodec)'''
        return self.maxcodec - self.lencodec if self.iscomplete else None

    @property
    def rancodec(self):
        '''return integer rancodec indicator (maxcodec - mincodec)'''
        return self.maxcodec - self.mincodec if self.iscomplete else None

    @property
    def typecodec(self):
        '''return string typecodec indicator
        (null, unique, complete, full, default, mixed)

        The tests are ordered: the first one that matches gives the result.
        '''
        if not self.iscomplete:
            return None
        if self.maxcodec == 0:
            return NULL
        if self.lencodec == 1:
            return UNIQUE
        if self.mincodec == self.maxcodec:
            return COMPLETE
        if self.lencodec == self.maxcodec:
            return FULL
        if self.lencodec == self.mincodec:
            return DEFAULT
        return MIXED
This class analyses field entities.
Attributes
- idfield : string - name or Id of the field
- lencodec: integer - codec length
- mincodec: integer - minimal codec length
- maxcodec: integer - maximal codec length
- hashf: integer - hash value to identify modifications
characteristic (@property)
instance methods
def __init__(self, idfield, lencodec=None, mincodec=None, maxcodec=None, hashf=None):
    '''Creation mode :
    - single dict attribute where keys are attributes name,
    - single AnaField attribute to make a copy
    - multiple attributes

    *Parameters (multiple attributes)*

    - **idfield** : string or integer - Id of the Field
    - **lencodec** : integer (default None) - length of the codec
    - **mincodec** : integer (default None) - number of different values
    - **maxcodec** : integer (default None) - length of the field
    - **hashf** : string (default None) - update identifier

    *Raises* : AnaError if, with multiple attributes, lencodec is missing,
    zero or not an integer.

    *example*

    AnaField is created with a dict
    >>> AnaField({'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}).to_dict()
    {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}

    AnaField is created with parameters (idfield is always the first one)
    >>> AnaField('fld', lencodec=4, mincodec=3, maxcodec=4).to_dict()
    {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
    >>> AnaField('fld', 4, 3, 4).to_dict()
    {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
    '''
    if isinstance(idfield, dict):
        # missing keys give None attributes; no validation in this mode
        self.idfield = idfield.get(IDFIELD, None)
        self.lencodec = idfield.get(LENCODEC, None)
        self.mincodec = idfield.get(MINCODEC, None)
        self.maxcodec = idfield.get(MAXCODEC, None)
        self.hashf = idfield.get(HASHF, None)
        return
    if isinstance(idfield, AnaField):
        # copy mode; AnaDfield derives from AnaField, so one test covers both
        self.idfield = idfield.idfield
        self.lencodec = idfield.lencodec
        self.mincodec = idfield.mincodec
        self.maxcodec = idfield.maxcodec
        self.hashf = idfield.hashf
        return
    if not lencodec or not isinstance(lencodec, int):
        raise AnaError("lencodec is not correct")
    self.idfield = idfield
    self.lencodec = lencodec
    self.mincodec = mincodec
    self.maxcodec = maxcodec
    self.hashf = hashf
Creation mode :
- single dict attribute where keys are attributes name,
- single AnaField attribute to make a copy
- multiple attributes
Parameters (multiple attributes)
- idfield : string or integer - Id of the Field
- lencodec : integer (default None) - length of the codec
- mincodec : integer (default None) - number of different values
- maxcodec : integer (default None) - length of the field
- hashf : string (default None) - update identifier
example
AnaField is created with a dict
>>> AnaField(Cfield([1,2,3,3]).to_analysis).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
>>> AnaField({'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
AnaField is created with parameters
>>> AnaField('fld', lencodec=4, mincodec=3, maxcodec=4).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
>>> AnaField('fld', 4, 3, 4).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
def to_dict(self, full=False, idfield=False, notnone=True):
    '''return the field attributes as a dict.

    *Parameters*

    - **full** : boolean (default False) - if True, idfield and the computed
    indicators are added to the codec lengths
    - **idfield** : boolean (default False) - if True, idfield is included
    - **notnone** : boolean (default True) - if True, None values are not included
    '''
    result = {LENCODEC: self.lencodec, MINCODEC: self.mincodec,
              MAXCODEC: self.maxcodec}
    if idfield or full:
        result[IDFIELD] = self.idfield
    if full:
        result.update({RATECODEC: self.ratecodec,
                       DMINCODEC: self.dmincodec,
                       DMAXCODEC: self.dmaxcodec,
                       RANCODEC: self.rancodec,
                       TYPECODEC: self.typecodec})
    return Util.reduce_dic(result) if notnone else result
return a dict with field attributes.
Parameters
- full : boolean (default False) - if True, all the attributes are included
- idfield : boolean (default False) - if True, idfield is included
- notnone : boolean (default True) - if True, None values are not included
class AnaRelation:
    '''This class analyses relationship between two fields

    *Attributes* :

    - **relation** : List of the two fields involved in the relationship
    - **dist** : value of the relationship
    - **distrib** : boolean True if values are distributed
    - **hashr**: integer - hash value to identify update

    *global (@property)*

    - `id_relation`
    - `index_relation`
    - `parent_child`
    - `typecoupl`

    *characteristic (@property)*

    - `dmax`
    - `dmin`
    - `diff`
    - `dran`
    - `distomin`
    - `distomax`
    - `distance`
    - `ratecpl`
    - `rateder`

    *instance methods*

    - `to_dict`
    '''

    def __init__(self, relation, dists, hashr=None):
        '''Constructor of the relationship :

        *Parameters*

        - **relation** : List of the two fields involved in the relationship
        - **dists** : dist value, or list (or tuple) with the dist value
        followed by the distrib boolean (True if values are distributed)
        - **hashr**: integer - hash value to identify update
        '''
        self.relation = relation
        if isinstance(dists, (list, tuple)):
            # a tuple used to be stored as the distance itself, which broke
            # every indicator computed from `dist`
            self.dist = dists[0]
            self.distrib = dists[1]
        else:
            self.dist = dists
            self.distrib = None
        self.hashr = hashr

    def __repr__(self):
        '''representation of the relation (class name + id of the two fields)'''
        return self.__class__.__name__ + '(' + str(self.id_relation) + ')'

    def __str__(self):
        '''json-text build with the attributes dict'''
        return json.dumps(self.to_dict(relation=True))

    def __eq__(self, other):
        ''' equal if class name and values are equal'''
        return self.__class__.__name__ == other.__class__.__name__ and \
            self.relation == other.relation and self.dist == other.dist and \
            self.hashr == other.hashr and self.distrib == other.distrib

    def __hash__(self):
        '''return hash value (sum of attributes hash)'''
        return hash(self.relation[0]) + hash(self.relation[1]) + \
            hash(self.dist) + hash(self.hashr) + hash(self.distrib)

    def to_dict(self, distances=False, full=False, mode='field', relation=False,
                notnone=True, misc=False):
        '''return a dict with AnaRelation attributes.

        *Parameters*

        - **distances** : boolean (default False) - if True, distances indicators are included
        - **full** : boolean (default False) - if True, all the attributes are included
        - **relation** : boolean (default False) - if True, the two fields
        (shown according to mode) and parent_child are included
        - **notnone** : boolean (default True) - if True, None values are not included
        - **misc** : boolean (default False) - if True, dmax, dmin, diff and
        dran are included
        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
        '''
        dic = {DIST: self.dist, TYPECOUPL: self.typecoupl, HASHR: self.hashr}
        if relation or full:
            dic[RELATION] = Util.view(self.relation, mode)
            dic[PARENTCHILD] = self.parent_child
        if distances or full:
            dic |= {DISTANCE: self.distance, DISTOMIN: self.distomin,
                    DISTOMAX: self.distomax, DISTRIBUTED: self.distrib,
                    RATECPL: self.ratecpl, RATEDER: self.rateder}
        if misc or full:
            dic |= {DMAX: self.dmax, DMIN: self.dmin,
                    DIFF: self.diff, DRAN: self.dran}
        if notnone:
            return Util.reduce_dic(dic)
        return dic

    @property
    def id_relation(self):
        '''return a list with the id of the two fields involved'''
        if self.relation:
            return [fld.idfield for fld in self.relation]
        return []

    @property
    def parent_child(self):
        '''returns the direction of the relationship (True if parent is first)

        The parent has the larger lencodec; with equal lencodec, the parent
        is the field with the lower index.'''
        rel0 = self.relation[0]
        rel1 = self.relation[1]
        return (rel0.lencodec > rel1.lencodec or
                (rel0.lencodec == rel1.lencodec and rel0.index < rel1.index))

    @property
    def index_relation(self):
        '''return a list with the index of the two fields involved'''
        if self.relation:
            return [fld.index for fld in self.relation]
        return []

    @property
    def dmax(self):
        '''return integer dmax indicator (product of the two lencodec)'''
        return self.relation[0].lencodec * self.relation[1].lencodec

    @property
    def dmin(self):
        '''return integer dmin indicator (largest of the two lencodec)'''
        return max(self.relation[0].lencodec, self.relation[1].lencodec)

    @property
    def diff(self):
        '''return integer diff indicator (absolute difference of the two lencodec)'''
        return abs(self.relation[0].lencodec - self.relation[1].lencodec)

    @property
    def dran(self):
        '''return integer dran indicator (dmax - dmin)'''
        return self.dmax - self.dmin

    @property
    def distomin(self):
        '''return integer distomin indicator (dist - dmin)'''
        return self.dist - self.dmin

    @property
    def distomax(self):
        '''return integer distomax indicator (dmax - dist)'''
        return self.dmax - self.dist

    @property
    def distance(self):
        '''return integer distance indicator (distomin + diff)'''
        return self.distomin + self.diff

    @property
    def ratecpl(self):
        '''return float ratecpl indicator (0 if distance + distomax is 0)'''
        disdis = self.distance + self.distomax
        return 0 if disdis == 0 else self.distance / disdis

    @property
    def rateder(self):
        '''return float rateder indicator (0 if dran is 0)'''
        return 0 if self.dran == 0 else self.distomin / self.dran

    @property
    def typecoupl(self):
        '''return relationship type (coupled, derived, crossed, linked)

        The tests are ordered: the first null value among distance, distomin
        and distomax gives the type; 'linked' is used when none is null.'''
        if self.distance == 0:
            return COUPLED
        if self.distomin == 0:
            return DERIVED
        if self.distomax == 0:
            return CROSSED
        return LINKED
This class analyses relationship between two fields
Attributes :
- relation : List of the two fields involved in the relationship
- dist : value of the relationship
- distrib : boolean True if values are distributed
- hashr: integer - hash value to identify update
global (@property)
characteristic (@property)
instance methods
def __init__(self, relation, dists, hashr=None):
    '''Constructor of the relationship :

    *Parameters*

    - **relation** : List of the two fields involved in the relationship
    - **dists** : dist value, or list (or tuple) with the dist value
    followed by the distrib boolean (True if values are distributed)
    - **hashr**: integer - hash value to identify update
    '''
    self.relation = relation
    if isinstance(dists, (list, tuple)):
        # a tuple used to be stored as the distance itself, which broke
        # every indicator computed from `dist`
        self.dist = dists[0]
        self.distrib = dists[1]
    else:
        self.dist = dists
        self.distrib = None
    self.hashr = hashr
Constructor of the relationship :
Parameters
- relation : List of the two fields involved in the relationship
- dists : dist value or list of dist value and distrib boolean
- distrib : boolean True if values are distributed (not a separate argument: it is given as the second item of dists)
- hashr: integer - hash value to identify update
def to_dict(self, distances=False, full=False, mode='field', relation=False,
            notnone=True, misc=False):
    '''return the AnaRelation attributes as a dict.

    *Parameters*

    - **distances** : boolean (default False) - if True, distances indicators are included
    - **full** : boolean (default False) - if True, all the attributes are included
    - **relation** : boolean (default False) - if True, the two fields
    (shown according to mode) and parent_child are included
    - **notnone** : boolean (default True) - if True, None values are not included
    - **misc** : boolean (default False) - if True, dmax, dmin, diff and
    dran are included
    - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
    '''
    result = {DIST: self.dist, TYPECOUPL: self.typecoupl, HASHR: self.hashr}
    if relation or full:
        result[RELATION] = Util.view(self.relation, mode)
        result[PARENTCHILD] = self.parent_child
    if distances or full:
        result.update({DISTANCE: self.distance,
                       DISTOMIN: self.distomin,
                       DISTOMAX: self.distomax,
                       DISTRIBUTED: self.distrib,
                       RATECPL: self.ratecpl,
                       RATEDER: self.rateder})
    if misc or full:
        result.update({DMAX: self.dmax, DMIN: self.dmin,
                       DIFF: self.diff, DRAN: self.dran})
    return Util.reduce_dic(result) if notnone else result
return a dict with AnaRelation attributes.
Parameters
- distances : boolean (default False) - if True, distances indicators are included
- full : boolean (default False) - if True, all the attributes are included
- relation : boolean (default False) - if True, idfield are included
- notnone : boolean (default True) - if True, None values are not included
- mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
class AnaDfield(AnaField):
    '''This class analyses structure and relationships of fields inside a dataset

    *Attributes* :

    - **dataset** : AnaDataset object where AnaDfield is included
    - **AnaField attributes** : inheritance of AnaField object

    *relationship (@property)*

    - `list_relations`
    - `list_p_derived`
    - `list_c_derived`
    - `list_coupled`

    *field (@property)*

    - `fields`
    - `p_derived`
    - `p_distance`
    - `p_distomin`

    *global (@property)*

    - `index`
    - `dist_root`
    - `category`

    *global (instance methods)*

    - `ascendants`
    - `to_dict`
    - `view`

    *other instance methods*

    - `dic_inner_node`
    '''
    def __new__(cls, other, dataset=None):
        '''initialization of attributes from "other"

        The AnaField attributes are copied here when "other" is an AnaField or
        an AnaDfield; for any other value a bare instance is returned.'''
        if isinstance(other, AnaDfield):
            new = AnaDfield.__copy__(other)
            return new
        if isinstance(other, AnaField):
            new = AnaField.__copy__(other)
            # the AnaField copy is turned into an AnaDfield
            new.__class__ = AnaDfield
            return new
        # NOTE(review): in this branch no AnaField attribute is set by
        # __new__ or __init__ - confirm which "other" values are expected
        return object.__new__(cls)

    def __init__(self, other, dataset):
        '''AnaDfield is created by adding a AnaDataset link to an AnaField object.

        *Parameters*

        - **other** : AnaField or AnaDfield to initialize attributes
        (used by __new__, not read here)
        - **dataset** : AnaDataset which includes the AnaDfield
        '''
        self.dataset = dataset

    def __copy__(self):
        ''' Copy all the data (AnaField attributes and dataset link)'''
        return self.__class__(AnaField(self), self.dataset)

    def __lt__(self, other):
        ''' return a comparison between field index'''
        return self.index < other.index

    @property
    def index(self):
        '''return the row of the field in the AnaDataset (-1 for the root)'''
        if self == self.dataset.root:
            return -1
        return self.dataset.fields.index(self)

    @property
    def fields(self):
        '''return the list of the fields included in the AnaDataset'''
        return self.dataset.fields

    @property
    def list_relations(self):
        '''return the list of the relations with the AnaDfield'''
        return list(self.dataset.relations[self].values())

    @property
    def list_p_derived(self):
        '''return the list of the derived relations with the parents of AnaDfield'''
        return [rel for rel in self.list_relations if rel.typecoupl == DERIVED
                and not rel.parent_child]

    @property
    def list_c_derived(self):
        '''return the list of the derived relations with the childs of AnaDfield
        (childs with the 'unique' category are excluded)'''
        return [rel for rel in self.list_relations if rel.typecoupl == DERIVED
                and rel.parent_child
                and rel.relation[1].category != UNIQUE]

    @property
    def list_coupled(self):
        '''return the list of the coupled relations with the AnaDfield'''
        return [rel for rel in self.list_relations if rel.typecoupl == COUPLED]

    @property
    def dist_root(self):
        '''return the distance to the root field (dataset length - lencodec)'''
        return len(self.dataset) - self.lencodec

    @property
    def category(self):
        '''return AnaDfield category (unique, rooted, coupled, derived, mixed)

        The tests are ordered: the first one that matches gives the result.'''
        if self.typecodec == UNIQUE:
            return UNIQUE
        if self.typecodec in (COMPLETE, FULL):
            return ROOTED
        # coupled: a coupled relation exists where this field is the child
        if COUPLED in [rel.typecoupl for rel in self.list_relations
                       if not rel.parent_child]:
            return COUPLED
        if not self.list_c_derived:
            return DERIVED
        return MIXED

    @property
    def p_derived(self):
        '''return the first derived or coupled parent of the AnaDfield'''
        if self.category in (UNIQUE, ROOTED):
            return self.dataset.root
        if self.category == COUPLED:
            # first coupled field which is not itself in the coupled category
            return [rel.relation[1] for rel in self.list_coupled
                    if not rel.relation[1].category == COUPLED][0]
        if not self.list_p_derived:
            return self.dataset.root
        distance_min = min(rel.distance for rel in self.list_p_derived)
        # NOTE(review): nesting rebuilt from a flattened listing; the final
        # return is assumed to come after the loop - confirm
        for rel in self.list_p_derived:
            if rel.distance == distance_min:
                if rel.relation[1].category == ROOTED:
                    return self.dataset.root
                if rel.relation[1].category == MIXED:
                    return rel.relation[1]
        return self.dataset.root

    @property
    def p_distance(self):
        '''return the first parent with minimal distance of the AnaDfield'''
        return self._p_min_dist()

    @property
    def p_distomin(self):
        '''return the first parent with minimal distomin of the AnaDfield'''
        return self._p_min_dist(False)

    def _p_min_dist(self, distance=True):
        '''return the parent with minimal distance of the AnaDfield

        *Parameters*

        - **distance** : boolean (default True) - if True the 'distance'
        indicator is used, else the 'distomin' indicator
        '''
        if self.category == UNIQUE:
            return self.dataset.root
        if distance:
            dist_up = [rel.distance for rel in self.list_relations if
                       not rel.parent_child]
            # not rel.parent_child and rel.relation[1].category != COUPLED]
        else:
            dist_up = [rel.distomin for rel in self.list_relations if
                       not rel.parent_child]
            # not rel.parent_child and rel.relation[1].category != COUPLED]
        # no parent, or no parent closer than the root : the root is returned
        if not dist_up or min(dist_up) == self.dist_root:
            return self.dataset.root
        dist_min = min(dist_up)
        if distance:
            list_dmin = [rel.relation[1] for rel in self.list_relations
                         if rel.distance == dist_min]
            # if rel.distance == dist_min and not rel.parent_child]
        else:
            list_dmin = [rel.relation[1] for rel in self.list_relations
                         if rel.distomin == dist_min]
            # if rel.distomin == dist_min and not rel.parent_child]
        # among the closest fields, the first with the largest lencodec
        max_lencodec = max(fld.lencodec for fld in list_dmin)
        return [fld for fld in list_dmin if fld.lencodec == max_lencodec][0]

    def to_dict(self, mode='id'):
        '''return a dict with field attributes.

        The dict holds all the AnaField attributes (None values included),
        completed with the dataset indicators and the three parents.

        *Parameters*

        - **mode** : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
        '''
        dic = super().to_dict(full=True, notnone=False)
        dic[DISTROOT] = self.dist_root
        dic[NUM] = self.index
        dic[CATEGORY] = self.category
        dic[PDISTANCE] = self.p_distance.view(mode)
        dic[PDISTOMIN] = self.p_distomin.view(mode)
        dic[PDERIVED] = self.p_derived.view(mode)
        return dic

    def view(self, mode='field'):
        ''' return a representation of the AnaDfield

        *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
        '''
        return Util.view(self, mode)

    def ascendants(self, typeparent='derived', mode='field'):
        ''' return the list of the AnaDfield's ascendants in the family tree up to
        the root AnaDfield.

        *Parameters*

        - **typeparent** : str (default 'derived') - 'derived', 'distance' or 'distomin'
        (any other value is handled as 'distomin')
        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')

        *Returns* : list of parents from closest to the most distant. Parents
        are represented with index, idfield, or object. The root is not included.
        '''
        parent = self
        listparent = []
        while parent != self.dataset.root:
            if typeparent == 'derived':
                parent = parent.p_derived
            elif typeparent == 'distance':
                parent = parent.p_distance
            else:
                parent = parent.p_distomin
            if parent != self.dataset.root:
                listparent.append(parent)
        return Util.view(listparent, mode)

    def dic_inner_node(self, mode, lname):
        '''return a child AnaDfield tree.

        *Parameters*

        - **lname** : integer - maximal length of the names
        - **mode** : string - kind of tree :
            'derived' : derived tree
            'distance': min distance tree
            'distomin': min distomin tree

        *Returns* : dict with a single item: the key is the index of the
        AnaDfield (str, padded with '*'), the value is a list with the node
        name followed by the dict of each child.
        '''
        # `adding` : indicator of the relation with the parent, then lencodec
        adding = ''
        if mode == 'distance':
            rel_parent = self.dataset.get_relation(self, self.p_distance)
            adding = str(rel_parent.distance) + ' - '
        elif mode == 'distomin':
            rel_parent = self.dataset.get_relation(self, self.p_distomin)
            adding = str(rel_parent.distomin) + ' - '
        elif mode == 'derived':
            rel_parent = self.dataset.get_relation(self, self.p_derived)
            adding = str(rel_parent.distance) + ' - '
        adding += str(self.lencodec)
        name = str(self.idfield)[:lname] + ' (' + adding + ')'
        # '*' replaces spaces and quotes inside the node name
        lis = [name.replace(' ', '*').replace("'", '*')]
        # NOTE(review): nesting of the 'derived' branch rebuilt from a
        # flattened listing - confirm
        if mode == 'derived':
            childs = []
            #if not self.category in (ROOTED, COUPLED):
            if not self.category in (ROOTED, COUPLED, UNIQUE):
                for rel in self.list_coupled:
                    lis.append(rel.relation[1].dic_inner_node(mode, lname))
            if not self.category in (ROOTED, UNIQUE):
                childs = [rel.relation[1] for rel in self.list_relations
                          if rel.relation[1].p_derived == self and
                          rel.relation[1].category != COUPLED]
        if mode == 'distomin':
            childs = [rel.relation[1] for rel in self.list_relations
                      if rel.relation[1].p_distomin == self]
        if mode == 'distance':
            childs = [rel.relation[1] for rel in self.list_relations
                      if rel.relation[1].p_distance == self]
        for fld in childs:
            lis.append(fld.dic_inner_node(mode, lname))
        return {str(self.index).ljust(2, '*'): lis}
This class analyses structure and relationships of fields inside a dataset
Attributes :
- dataset : AnaDataset object where AnaDfield is included
- AnaField attributes : inheritance of AnaField object
relationship (@property)
field (@property)
global (@property)
global (instance methods)
other instance methods
def __init__(self, other, dataset):
    '''Attach the owning AnaDataset to the AnaDfield.

    *Parameters*

    - **other** : AnaField or AnaDfield - source of the field attributes
      (not read by this method)
    - **dataset** : AnaDataset which includes the AnaDfield
    '''
    # only the dataset link is stored here; `other` is not used in this body
    self.dataset = dataset
AnaDfield is created by adding a AnaDataset link to an AnaField object.
Parameters
- other : AnaField or AnaDfield to initialize attributes
- dataset : AnaDataset which includes the AnaDfield
def to_dict(self, mode='id'):
    '''return a dict with the field attributes completed with the dataset
    related ones (distance to root, index, category and the three parents).

    *Parameters*

    - **mode** : str (default 'id') - representation of the parent AnaDfield
      ('field', 'id', 'index')
    '''
    dic = super().to_dict(full=True, notnone=False)
    # the dict literal is evaluated in the same order as the original assignments
    dic.update({
        DISTROOT: self.dist_root,
        NUM: self.index,
        CATEGORY: self.category,
        PDISTANCE: self.p_distance.view(mode),
        PDISTOMIN: self.p_distomin.view(mode),
        PDERIVED: self.p_derived.view(mode),
    })
    return dic
return a dict with field attributes.
Parameters
- mode : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
def view(self, mode='field'):
    '''return the representation of the AnaDfield selected by mode
    (delegated to `Util.view`).

    *Parameters*

    - **mode** : str (default 'field') - AnaDfield representation
      ('field', 'id', 'index')
    '''
    representation = Util.view(self, mode)
    return representation
return a representation of the AnaDfield
Parameters
- mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
def ascendants(self, typeparent='derived', mode='field'):
    '''return the chain of parents of the AnaDfield, the root AnaDfield
    being excluded.

    *Parameters*

    - **typeparent** : str (default 'derived') - 'derived', 'distance' or
      'distomin' (any other value is handled as 'distomin')
    - **mode** : str (default 'field') - AnaDfield representation
      ('field', 'id', 'index')

    *Returns* : list of parents from closest to the most distant, with the
    representation selected by mode
    '''
    # attribute that gives the parent for the requested kind of tree
    if typeparent == 'derived':
        link = 'p_derived'
    elif typeparent == 'distance':
        link = 'p_distance'
    else:
        link = 'p_distomin'
    lineage = []
    current = self
    while current != self.dataset.root:
        current = getattr(current, link)
        if current != self.dataset.root:
            lineage.append(current)
    return Util.view(lineage, mode)
return the list of the AnaDfield's ascendants in the family tree up to the root AnaDfield.
Parameters
- typeparent : str (default 'derived') - 'derived', 'distance' or 'distomin'
- mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
Returns : list of parents from closest to the most distant. Parents are represented with index, idfield, or object
def dic_inner_node(self, mode, lname):
    '''return the tree of the AnaDfield and of its descendants.

    *Parameters*

    - **lname** : integer - maximal length of the names
    - **mode** : string - kind of tree :
        'derived' : derived tree
        'distance': min distance tree
        'distomin': min distomin tree

    *Returns* : dict with a single key (index of the AnaDfield as a string
    padded to two characters with '*'). The value is a list : first the
    label of the AnaDfield then one dict per child.
    '''
    # value of the relation with the parent, shown before the codec length
    if mode == 'distance':
        prefix = str(self.dataset.get_relation(self, self.p_distance).distance) + ' - '
    elif mode == 'distomin':
        prefix = str(self.dataset.get_relation(self, self.p_distomin).distomin) + ' - '
    elif mode == 'derived':
        prefix = str(self.dataset.get_relation(self, self.p_derived).distance) + ' - '
    else:
        prefix = ''
    label = str(self.idfield)[:lname] + ' (' + prefix + str(self.lencodec) + ')'
    # spaces and quotes are replaced by '*' (placeholder handled by the caller)
    node = [label.translate(str.maketrans({' ': '*', "'": '*'}))]

    if mode == 'derived':
        children = []
        if self.category not in (ROOTED, COUPLED, UNIQUE):
            # coupled fields are listed first, directly under the node
            node.extend(rel.relation[1].dic_inner_node(mode, lname)
                        for rel in self.list_coupled)
        if self.category not in (ROOTED, UNIQUE):
            related = (rel.relation[1] for rel in self.list_relations)
            children = [fld for fld in related
                        if fld.p_derived == self and fld.category != COUPLED]
    elif mode == 'distomin':
        children = [rel.relation[1] for rel in self.list_relations
                    if rel.relation[1].p_distomin == self]
    elif mode == 'distance':
        children = [rel.relation[1] for rel in self.list_relations
                    if rel.relation[1].p_distance == self]
    node.extend(child.dic_inner_node(mode, lname) for child in children)
    return {str(self.index).ljust(2, '*'): node}
return a child AnaDfield tree.
Parameters
- lname : integer - maximal length of the names
- mode : string (default 'derived') - kind of tree : 'derived' : derived tree 'distance': min distance tree 'distomin': min distomin tree
Returns : dict where the key is an AnaDfield and the value is the list of its children.
class AnaDataset:
    '''This class analyses the structure of a dataset.

    *Attributes* :

    - **iddataset** : string or integer - Id of the Dataset
    - **fields** : list of the AnaDfields included
    - **relations** : dict of the AnaRelations between two AnaDfields
    - **hashd** : string - update identifier

    *relationship (@property)*

    - `ana_relations`
    - `p_relations`

    *field (@property)*

    - `root`
    - `primary`
    - `secondary`
    - `unique`
    - `variable`

    *global (@property)*

    - `category`
    - `complete`
    - `dimension`

    *update (instance methods)*

    - `set_relations`

    *access (instance methods)*

    - `get_relation`
    - `dfield`

    *synthesis (instance methods)*

    - `tree`
    - `to_dict`
    - `indicator`
    - `partitions`
    - `field_partition`
    '''

    def __init__(self, fields=None, relations=None, iddataset=None,
                 leng=None, hashd=None):
        '''Creation mode :
        - single dict attribute where keys are attributes name,
        - single AnaDataset attribute (the new object shares the fields and
          relations containers of the source),
        - multiple attributes

        *Parameters (multiple attributes)*

        - **fields** : list (default None) - items used to build the AnaFields
        - **relations** : dict (default None) - for each field, the dict
          expected by `set_relations`
        - **iddataset** : string or integer (default None) - Id of the Dataset
        - **leng** : integer (default None) - if present, value assigned to the
          maxcodec of every field
        - **hashd** : string (default None) - update identifier
        '''
        if isinstance(fields, AnaDataset):
            self.iddataset = fields.iddataset
            self.fields = fields.fields
            self.relations = fields.relations
            self.hashd = fields.hashd
            return
        if isinstance(fields, dict):
            # `fields` is rebound last because the other values are read from it
            iddataset = fields.get(IDDATASET, None)
            leng = fields.get(LENGTH, None)
            relations = fields.get(RELATIONS, None)
            hashd = fields.get(HASHD)
            fields = fields.get(FIELDS, None)
        self.iddataset = iddataset
        self.fields = [AnaDfield(AnaField(field), self)
                       for field in fields] if fields else []
        if leng:
            for fld in self.fields:
                fld.maxcodec = leng
        self.relations = {field: {} for field in self.fields}
        if relations:
            for fld, dic_relation in relations.items():
                self.set_relations(fld, dic_relation)
        self.hashd = hashd

    def __len__(self):
        '''length of the AnaDataset (len of the AnaDfields included),
        0 if the AnaDataset has no field'''
        # default=0 : max() raises ValueError on an empty sequence
        return max((len(fld) for fld in self.fields), default=0)

    def __eq__(self, other):
        ''' equal if class and values are equal'''
        return self.__class__ .__name__ == other.__class__.__name__ and \
            self.fields == other.fields and self.relations == other.relations and \
            self.iddataset == other.iddataset and self.hashd == other.hashd

    def __hash__(self):
        '''return hash value (sum of attributes hash)'''
        # NOTE: iterating the relations dict yields its keys (the AnaDfields)
        return hash(self.iddataset) + sum(hash(fld) for fld in self.fields) + \
            sum(hash(rel) for rel in self.relations) + hash(self.hashd)

    @property
    def category(self):
        '''return a list of AnaDfield category (unique, rooted, coupled, derived, mixed)'''
        return [fld.category for fld in self.fields]

    @property
    def ana_relations(self):
        '''return the list of AnaRelation included'''
        return [rel for fldrel in self.relations.values() for rel in fldrel.values()]

    @property
    def p_relations(self):
        '''return the list of oriented AnaRelation (parent first, child second)'''
        return [rel for rel in self.ana_relations if rel.parent_child]

    @property
    def root(self):
        '''return the root AnaDfield (a new object is built at each access)'''
        len_self = len(self)
        return AnaDfield(AnaField(ROOT, len_self, len_self, len_self), self)

    @property
    def primary(self):
        '''return the first partition of the partitions'''
        part = self.partitions(distributed=True)
        return part[0] if part else []

    @property
    def complete(self):
        '''return True if the dimension is not 0'''
        return self.dimension > 0

    @property
    def dimension(self):
        '''return the highest partition length'''
        return len(self.primary)

    @property
    def secondary(self):
        '''return the derived or coupled fields from primary'''
        # primary is computed once (each access recomputes the partitions)
        primary = self.primary
        secondary = []
        for field in primary:
            self._add_child(field, secondary)
        return [fld for fld in secondary if fld not in primary]

    @property
    def unique(self):
        '''return the unique fields'''
        return [fld for fld in self.fields if fld.category == UNIQUE]

    @property
    def variable(self):
        '''return the variable fields (neither primary, secondary nor unique)'''
        # built once instead of once per field
        classified = self.primary + self.secondary + self.unique
        return [fld for fld in self.fields if fld not in classified]

    def set_relations(self, field, dic_relations):
        '''Add relations in the AnaDataset from a dict.

        *Parameters*

        - **field** : AnaDfield, AnaField or str (idfield) - first relation AnaDfield
        - **dic_relations** : dict - key is the second relation AnaDfield and
        value is the dist value or the list [dist, distrib]
        '''
        fld = self.dfield(field)
        for other, dist in dic_relations.items():
            oth = self.dfield(other)
            # the relation is stored in both directions
            self.relations[fld][oth] = AnaRelation([fld, oth], dist)
            self.relations[oth][fld] = AnaRelation([oth, fld], dist)

    def get_relation(self, fld1, fld2):
        '''Return AnaRelation between fld1 and fld2.

        *Parameters*

        - **fld1** : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield
        - **fld2** : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield
        '''
        fl1 = self.dfield(fld1)
        fl2 = self.dfield(fld2)
        if self.root in [fl1, fl2]:
            # relations with the root are not stored, they are built on demand
            return AnaRelation([fl1, fl2], len(self))
        return self.relations[fl1][fl2]

    def dfield(self, fld):
        '''return the AnaDfield matching with fld.
        Fld is str, int, AnaDfield or AnaField.
        None is returned if fld is a str which is not the idfield of a field.'''
        if fld in (-1, ROOT):
            return self.root
        if isinstance(fld, AnaDfield):
            return fld
        if isinstance(fld, int):
            return self.fields[fld]
        if isinstance(fld, str):
            return next((dfld for dfld in self.fields if dfld.idfield == fld), None)
        return AnaDfield(fld, self)

    def tree(self, mode='derived', width=5, lname=20, string=True):
        '''return a string with a tree of derived Field.

        *Parameters*

        - **lname** : integer (default 20) - length of the names
        - **width** : integer (default 5) - length of the lines
        - **string** : boolean (default True) - if True return str else return dict
        - **mode** : string (default 'derived') - kind of tree :
            'derived' : derived tree
            'distance': min distance tree
            'distomin': min distomin tree

        *Raises* : ValueError if mode is not one of the three kinds of tree
        '''
        if mode not in ('derived', 'distance', 'distomin'):
            raise ValueError("mode should be 'derived', 'distance' or 'distomin'")
        lis = ['root-' + mode + '*(' + str(len(self)) + ')']
        root = self.root
        # the parent attribute is p_derived, p_distance or p_distomin
        childs = [fld for fld in self.fields if getattr(fld, 'p_' + mode) == root]
        for fld in childs:
            lis.append(fld.dic_inner_node(mode, lname))
        tree = {str(-1).ljust(2, '*'): lis}
        if string:
            tre = pprint.pformat(tree, indent=0, width=width)
            tre = tre.replace('---', ' - ')
            # NOTE(review): as listed this replacement is a no-op; the listing
            # may have lost a doubled space - confirm against the repository
            tre = tre.replace(' ', ' ')
            tre = tre.replace('*', ' ')
            for car in ["'", "\"", "{", "[", "]", "}", ","]:
                tre = tre.replace(car, "")
            return tre
        return Util.clean_dic(tree, '*', ' ')

    def to_dict(self, mode='field', keys=None, relations=False):
        '''return a dict with fields attributes and optionally relations attributes.

        *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')
        - **relations** : boolean (default: False) - if False return a list of fields,
        if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}'
        - **keys** : string, list or tuple - list of keys or single key to return
        if 'all' or None, all keys are returned
        if list, only keys in list are returned
        if string, only values associated to the string(key) are returned'''
        fields = Util.filter_dic([fld.to_dict(mode=mode)
                                  for fld in self.fields], keys)
        leng = len(self.fields)
        if not relations:
            return fields
        # index -1 is the root : every pair (i, j) with i < j is exported once
        return {'fields': fields, 'relations':
                [self.get_relation(i, j).to_dict(full=True, mode=mode)
                 for i in range(-1, leng) for j in range(i + 1, leng)]}

    def partitions(self, mode='field', distributed=True):
        '''return a list of available partitions (the first is highest).

        *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')
        - **distributed** : boolean (default True) - Include only distributed fields
        '''
        partit = [[fld] for fld in self.fields if fld.category == ROOTED]
        crossed = [rel for rel in self.ana_relations if rel.typecoupl == CROSSED
                   and rel.parent_child
                   and rel.relation[0].category != COUPLED
                   and rel.relation[1].category != COUPLED]
        if distributed:
            crossed = [rel for rel in crossed if rel.distrib]
        if len(crossed) == 1 and crossed[0].dist == len(self):
            partit.insert(0, crossed[0].relation)
        elif crossed:
            for size in range(1, len(crossed) + 1):
                for candidat in combinations(crossed, size):
                    flds = list({rel.relation[i]
                                 for rel in candidat for i in (0, 1)})
                    # kept when the product of the codec lengths is the length
                    # of the dataset and every pair of fields is in candidat
                    if (reduce(mul, [fld.lencodec for fld in flds]) == len(self) and
                            len(candidat) == sum(range(len(flds))) and
                            (not distributed or min(rel.distrib for rel in candidat))):
                        partit.insert(0, flds)
        partit = Util.view(partit, mode)
        # duplicates are removed, the longest partitions come first
        return [list(tup) for tup in
                sorted(sorted({tuple(sorted(prt)) for prt in partit}),
                       key=len, reverse=True)]

    def field_partition(self, mode='field', partition=None, distributed=True):
        '''return a partition dict with the list of primary, secondary, unique
        and variable fields.

        *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')
        - **partition** : list (default None) - if None, partition is the first
        - **distributed** : boolean (default True) - Include only distributed fields
        '''
        if not partition:
            partitions = self.partitions(distributed=distributed)
            if not partitions:
                return {'primary': [], 'secondary': [], 'unique': [], 'variable': []}
            partition = partitions[0]
        else:
            partition = [self.dfield(fld) for fld in partition]
        secondary = []
        for field in partition:
            self._add_child(field, secondary)
        secondary = [fld for fld in secondary if fld not in partition]
        unique = [fld for fld in self.fields if fld.category == UNIQUE]
        classified = partition + secondary + unique
        variable = [fld for fld in self.fields if fld not in classified]
        return Util.view({'primary': partition, 'secondary': secondary,
                          'unique': unique, 'variable': variable}, mode)

    def indicator(self, fullsize, size):
        '''generate size indicators: ol (object lightness), ul (unicity level),
        gain (sizegain)

        *Parameters*

        - **fullsize** : int - size with full codec
        - **size** : int - size with existing codec

        *Returns* : dict - 'mean coding size' and 'object lightness' are None
        when the number of values equals the number of codec values'''
        lenindex = len(self.fields)
        indexlen = sum(fld.lencodec for fld in self.fields)
        nval = len(self) * (lenindex + 1)
        sval = fullsize / nval
        ncod = indexlen + lenindex

        # both indicators are undefined when nval == ncod (division by zero)
        scod = olight = None
        if nval != ncod:
            scod = (size - ncod * sval) / (nval - ncod)
            olight = scod / sval
        return {'total values': nval, 'mean size': round(sval, 3),
                'unique values': ncod,
                'mean coding size': None if scod is None else round(scod, 3),
                'unicity level': round(ncod / nval, 3),
                'optimize level': round(size / fullsize, 3),
                'object lightness': None if olight is None else round(olight, 3),
                'maxgain': round((nval - ncod) / nval, 3),
                'gain': round((fullsize - size) / fullsize, 3)}

    def _add_child(self, field, childs):
        ''' add derived or coupled fields in the childs list'''
        for rel in field.list_c_derived + field.list_coupled:
            child = rel.relation[1]
            if child not in childs and not child.category == UNIQUE:
                childs.append(child)
                # descendants of coupled or unique fields are not explored
                if child.category not in (COUPLED, UNIQUE):
                    self._add_child(child, childs)
This class analyses the structure of a dataset.
Attributes :
- iddataset : string or integer - Id of the Dataset
- fields : list of the AnaDfields included
- relations : dict of the AnaRelations between two AnaDfields
- hashd : string - update identifier
relationship (@property)
field (@property)
global (@property)
update (instance methods)
access (instance methods)
synthesis (instance methods)
def __init__(self, fields=None, relations=None, iddataset=None,
             leng=None, hashd=None):
    '''Creation mode :
    - single dict attribute where keys are attributes name,
    - single AnaDataset attribute (the new object shares the fields and
      relations containers of the source),
    - multiple attributes

    *Parameters (multiple attributes)*

    - **fields** : list (default None) - items used to build the AnaFields
    - **relations** : dict (default None) - for each field, the dict
      expected by `set_relations`
    - **iddataset** : string or integer (default None) - Id of the Dataset
    - **leng** : integer (default None) - if present, value assigned to the
      maxcodec of every field
    - **hashd** : string (default None) - update identifier
    '''
    if isinstance(fields, AnaDataset):
        source = fields
        self.iddataset = source.iddataset
        self.fields = source.fields
        self.relations = source.relations
        self.hashd = source.hashd
        return
    if isinstance(fields, dict):
        # dict mode : every attribute is read from the dict
        spec = fields
        iddataset = spec.get(IDDATASET)
        leng = spec.get(LENGTH)
        relations = spec.get(RELATIONS)
        hashd = spec.get(HASHD)
        fields = spec.get(FIELDS)
    self.iddataset = iddataset
    if fields:
        dfields = [AnaDfield(AnaField(item), self) for item in fields]
    else:
        dfields = []
    self.fields = dfields
    if leng:
        for dfld in self.fields:
            dfld.maxcodec = leng
    self.relations = {dfld: {} for dfld in self.fields}
    for first, dic_relation in (relations or {}).items():
        self.set_relations(first, dic_relation)
    self.hashd = hashd
Creation mode :
- single dict attribute where keys are attributes name,
- single AnaDataset attribute to make a copy
- multiple attributes
Parameters (multiple attributes)
- fields : list (default None) - items used to build the AnaFields
- relations : dict (default None) - for each field, the dict expected by set_relations
- iddataset : string or integer (default None) - Id of the Dataset
- leng : integer (default None) - if present, value assigned to the maxcodec of every field
- hashd : string (default None) - update identifier
def set_relations(self, field, dic_relations):
    '''Add relations in the AnaDataset from a dict. Each relation is
    stored in both directions.

    *Parameters*

    - **field** : AnaDfield, AnaField or str (idfield) - first relation AnaDfield
    - **dic_relations** : dict - key is the second relation AnaDfield and
    value is the dist value or the list [dist, distrib]
    '''
    source = self.dfield(field)
    for key, dist in dic_relations.items():
        target = self.dfield(key)
        forward = AnaRelation([source, target], dist)
        self.relations[source][target] = forward
        backward = AnaRelation([target, source], dist)
        self.relations[target][source] = backward
Add relations in the AnaDataset from a dict.
Parameters
- field : AnaDfield, AnaField or str (idfield) - first relation AnaDfield
- dic_relations : dict - key is the second relation AnaDfield and value is the dist value or the list [dist, distrib]
def get_relation(self, fld1, fld2):
    '''Return AnaRelation between fld1 and fld2.

    A relation with the root AnaDfield is not read from the stored
    relations : it is built with the length of the dataset as dist value.

    *Parameters*

    - **fld1** : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield
    - **fld2** : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield
    '''
    fl1 = self.dfield(fld1)
    fl2 = self.dfield(fld2)
    if self.root in [fl1, fl2]:
        return AnaRelation([fl1, fl2], len(self))
    # the fields resolved above are reused (no second call to dfield)
    return self.relations[fl1][fl2]
Return AnaRelation between fld1 and fld2.
Parameters
- fld1 : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield
- fld2 : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield
def dfield(self, fld):
    '''return the AnaDfield matching with fld.

    - -1 or the root name : the root AnaDfield
    - AnaDfield : fld itself
    - int : the field at this position in the fields list
    - str : the first field with this idfield (None if there is none)
    - any other value : a new AnaDfield built from fld
    '''
    if fld in (-1, ROOT):
        return self.root
    if isinstance(fld, AnaDfield):
        return fld
    if isinstance(fld, int):
        return self.fields[fld]
    if not isinstance(fld, str):
        return AnaDfield(fld, self)
    for dfld in self.fields:
        if dfld.idfield == fld:
            return dfld
    return None
return the AnaDfield matching with fld. Fld is str, int, AnaDfield or AnaField
def tree(self, mode='derived', width=5, lname=20, string=True):
    '''return a string with a tree of derived Field.

    *Parameters*

    - **lname** : integer (default 20) - length of the names
    - **width** : integer (default 5) - length of the lines
    - **string** : boolean (default True) - if True return str else return dict
    - **mode** : string (default 'derived') - kind of tree :
        'derived' : derived tree
        'distance': min distance tree
        'distomin': min distomin tree

    *Raises* : ValueError if mode is not one of the three kinds of tree
    '''
    # an unsupported mode used to fail later with an UnboundLocalError
    if mode not in ('derived', 'distance', 'distomin'):
        raise ValueError("mode should be 'derived', 'distance' or 'distomin'")
    lis = ['root-' + mode + '*(' + str(len(self)) + ')']
    root = self.root
    # the parent attribute is p_derived, p_distance or p_distomin
    childs = [fld for fld in self.fields if getattr(fld, 'p_' + mode) == root]
    for fld in childs:
        lis.append(fld.dic_inner_node(mode, lname))
    tree = {str(-1).ljust(2, '*'): lis}
    if string:
        tre = pprint.pformat(tree, indent=0, width=width)
        tre = tre.replace('---', ' - ')
        # NOTE(review): as listed this replacement is a no-op; the listing
        # may have lost a doubled space - confirm against the repository
        tre = tre.replace(' ', ' ')
        tre = tre.replace('*', ' ')
        for car in ["'", "\"", "{", "[", "]", "}", ","]:
            tre = tre.replace(car, "")
        return tre
    return Util.clean_dic(tree, '*', ' ')
return a string with a tree of derived Field.
Parameters
- lname : integer (default 20) - length of the names
- width : integer (default 5) - length of the lines
- string : boolean (default True) - if True return str else return dict
- mode : string (default 'derived') - kind of tree : 'derived' : derived tree 'distance': min distance tree 'distomin': min distomin tree
def to_dict(self, mode='field', keys=None, relations=False):
    '''return the description of the fields and, on request, of the relations.

    *Parameters*

    - **mode** : str (default 'field') - AnaDfield representation
    ('field', 'id', 'index')
    - **relations** : boolean (default: False) - if False return a list of fields,
    if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}'
    - **keys** : string, list or tuple - list of keys or single key to return
    if 'all' or None, all keys are returned
    if list, only keys in list are returned
    if string, only values associated to the string(key) are returned'''
    described = [fld.to_dict(mode=mode) for fld in self.fields]
    fields = Util.filter_dic(described, keys)
    if not relations:
        return fields
    nb_fields = len(self.fields)
    exported = []
    # index -1 is the root : every pair (first, second) is exported once
    for first in range(-1, nb_fields):
        for second in range(first + 1, nb_fields):
            relation = self.get_relation(first, second)
            exported.append(relation.to_dict(full=True, mode=mode))
    return {'fields': fields, 'relations': exported}
return a dict with fields attributes and optionally relations attributes.
Parameters
- mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
- relations : boolean (default: False) - if False return a list of fields, if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}'
- keys : string, list or tuple - list of keys or single key to return if 'all' or None, all keys are returned if list, only keys in list are returned if string, only values associated to the string(key) are returned
def partitions(self, mode='field', distributed=True):
    '''return the list of the available partitions, without duplicates,
    the longest first.

    *Parameters*

    - **mode** : str (default 'field') - AnaDfield representation
    ('field', 'id', 'index')
    - **distributed** : boolean (default True) - Include only distributed fields
    '''
    # a rooted field is a partition on its own
    found = [[fld] for fld in self.fields if fld.category == ROOTED]
    crossed = []
    for rel in self.ana_relations:
        if (rel.typecoupl == CROSSED and rel.parent_child
                and rel.relation[0].category != COUPLED
                and rel.relation[1].category != COUPLED):
            crossed.append(rel)
    if distributed:
        crossed = [rel for rel in crossed if rel.distrib]
    if len(crossed) == 1 and crossed[0].dist == len(self):
        found.insert(0, crossed[0].relation)
    elif crossed:
        for size in range(1, len(crossed) + 1):
            for group in combinations(crossed, size):
                flds = list({rel.relation[pos] for rel in group for pos in (0, 1)})
                # the product of the codec lengths has to be the dataset length
                if reduce(mul, [fld.lencodec for fld in flds]) != len(self):
                    continue
                # every pair of fields has to be present in the group
                if len(group) != sum(range(len(flds))):
                    continue
                if distributed and not min(rel.distrib for rel in group):
                    continue
                found.insert(0, flds)
    found = Util.view(found, mode)
    ordered = sorted({tuple(sorted(part)) for part in found})
    ordered.sort(key=len, reverse=True)
    return [list(part) for part in ordered]
return a list of available partitions (the first is highest).
Parameters
- mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
- distributed : boolean (default True) - Include only distributed fields
def field_partition(self, mode='field', partition=None, distributed=True):
    '''return a dict with the primary, secondary, unique and variable
    fields of a partition.

    *Parameters*

    - **mode** : str (default 'field') - AnaDfield representation
    ('field', 'id', 'index')
    - **partition** : list (default None) - if None, partition is the first
    - **distributed** : boolean (default True) - Include only distributed fields

    *Returns* : dict with the keys 'primary', 'secondary', 'unique' and
    'variable' (empty lists if no partition is available)
    '''
    if partition:
        primary = [self.dfield(fld) for fld in partition]
    else:
        available = self.partitions(distributed=distributed)
        if not available:
            return {'primary': [], 'secondary': [], 'unique': [], 'variable': []}
        primary = available[0]
    descendants = []
    for fld in primary:
        self._add_child(fld, descendants)
    secondary = [fld for fld in descendants if fld not in primary]
    unique = [fld for fld in self.fields if fld.category == UNIQUE]
    # variable fields are the fields which are in none of the three lists
    classified = primary + secondary + unique
    variable = [fld for fld in self.fields if fld not in classified]
    result = {'primary': primary, 'secondary': secondary,
              'unique': unique, 'variable': variable}
    return Util.view(result, mode)
return a partition dict with the list of primary, secondary, unique and variable fields.
Parameters
- mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
- partition : list (default None) - if None, partition is the first
- distributed : boolean (default True) - Include only distributed fields
def indicator(self, fullsize, size):
    '''generate size indicators: ol (object lightness), ul (unicity level),
    gain (sizegain)

    *Parameters*

    - **fullsize** : int - size with full codec
    - **size** : int - size with existing codec

    *Returns* : dict - 'mean coding size' and 'object lightness' are None
    when the number of values equals the number of codec values'''
    lenindex = len(self.fields)
    indexlen = sum(fld.lencodec for fld in self.fields)
    nval = len(self) * (lenindex + 1)
    sval = fullsize / nval
    ncod = indexlen + lenindex

    # both indicators are undefined when nval == ncod (division by zero);
    # they used to raise (unbound scod, round(None)) in that case
    scod = olight = None
    if nval != ncod:
        scod = (size - ncod * sval) / (nval - ncod)
        olight = scod / sval
    return {'total values': nval, 'mean size': round(sval, 3),
            'unique values': ncod,
            'mean coding size': None if scod is None else round(scod, 3),
            'unicity level': round(ncod / nval, 3),
            'optimize level': round(size / fullsize, 3),
            'object lightness': None if olight is None else round(olight, 3),
            'maxgain': round((nval - ncod) / nval, 3),
            'gain': round((fullsize - size) / fullsize, 3)}
generate size indicators: ol (object lightness), ul (unicity level), gain (sizegain)
Parameters
- fullsize : int - size with full codec
- size : int - size with existing codec
Returns : dict
class Util:
    ''' common functions for analysis package'''

    @staticmethod
    def view(field_struc, mode):
        ''' return a representation of a AnaDfields structure (fields, id, index).

        *Parameters*

        - **mode** : str - AnaDfield representation ('field', 'id', 'index')
        - **field_struc** : AnaField, list, list of list or dict of list -
          structure to represent (returned unchanged if mode is None or
          'field', or if the structure is empty)
        '''
        if mode is None or mode == 'field' or not field_struc:
            return field_struc

        def rep(fld):
            # idfield for the 'id' mode, index for any other mode
            return fld.idfield if mode == 'id' else fld.index

        if isinstance(field_struc, dict):
            return {key: [rep(fld) for fld in val]
                    for key, val in field_struc.items()}
        if isinstance(field_struc, list):
            # the first item tells whether the list is a list of lists
            if isinstance(field_struc[0], list):
                return [[rep(fld) for fld in val] for val in field_struc]
            return [rep(fld) for fld in field_struc]
        if isinstance(field_struc, AnaField):
            return rep(field_struc)
        return field_struc

    @staticmethod
    def reduce_dic(obj):
        '''return a copy of obj where the None values of the dicts are
        removed (dicts are searched inside lists and inside dict values)'''
        if isinstance(obj, list):
            return list(map(Util.reduce_dic, obj))
        if isinstance(obj, dict):
            return {key: Util.reduce_dic(val) for key, val in obj.items()
                    if val is not None}
        return obj

    @staticmethod
    def clean_dic(obj, old, new):
        '''return a dict or list with updated strings by replacing "old" substring
        with "new" substring (keys and values of dicts, items of lists)'''
        if isinstance(obj, dict):
            cleaned = {}
            for key, val in obj.items():
                cleaned[Util.clean_dic(key, old, new)] = Util.clean_dic(val, old, new)
            return cleaned
        if isinstance(obj, str):
            return obj.replace(old, new)
        if isinstance(obj, list):
            return [Util.clean_dic(val, old, new) for val in obj]
        return obj

    @staticmethod
    def filter_dic(obj, keys):
        '''return extract of a list of dict or of a dict

        *Parameters*

        - **keys** : string, list or tuple - list of keys or single key to return
        if 'all' or None, all keys are returned
        if list, only keys in list are returned
        if string, only values associated to the string(key) are returned'''
        if not keys or keys == 'all':
            return obj
        if isinstance(obj, list):
            return [Util.filter_dic(dic, keys) for dic in obj]
        if not isinstance(obj, dict):
            return obj
        if isinstance(keys, str):
            return obj.get(keys)
        if isinstance(keys, (list, tuple)):
            return {key: val for key, val in obj.items() if key in keys}
        return obj
Common functions for the analysis package.
@staticmethod
def view(field_struc, mode):
    '''Return a representation of an AnaDfield structure (fields, id, index).

    *Parameters*

    - **mode** : str - AnaDfield representation ('field', 'id', 'index')
    - **field_struc** : list or dict - structure to represent

    The structure is returned unchanged when mode is None or 'field', or
    when the structure is empty. Otherwise each field is replaced by its
    `idfield` (mode 'id') or by its `index` (any other mode).
    '''
    unchanged = mode is None or mode == 'field' or not field_struc
    if unchanged:
        return field_struc

    # attribute read on every field: 'idfield' for mode 'id', else 'index'
    attr = 'idfield' if mode == 'id' else 'index'

    if isinstance(field_struc, dict):
        result = {}
        for name, flds in field_struc.items():
            result[name] = [getattr(item, attr) for item in flds]
        return result

    if isinstance(field_struc, list):
        # the first item alone decides whether the list is treated as nested
        if isinstance(field_struc[0], list):
            return [[getattr(item, attr) for item in flds]
                    for flds in field_struc]
        return [getattr(item, attr) for item in field_struc]

    if isinstance(field_struc, AnaField):
        return getattr(field_struc, attr)
    return field_struc
return a representation of an AnaDfield structure (fields, id, index).
Parameters
- mode : str - AnaDfield representation ('field', 'id', 'index')
- field_struc : list or dict - structure to represent
@staticmethod
def reduce_dic(obj):
    '''Return a copy of obj in which dict entries whose value is None are
    removed.

    Dicts and lists are processed recursively. None items of a list are
    kept. Any other object is returned unchanged.'''
    if isinstance(obj, dict):
        # the filter runs before the recursive call, so None never recurses
        return {key: Util.reduce_dic(val) for key, val in obj.items()
                if val is not None}
    if isinstance(obj, list):
        return [Util.reduce_dic(val) for val in obj]
    return obj
return a dict without None values
@staticmethod
def clean_dic(obj, old, new):
    '''Return obj with every "old" substring replaced by "new" in its strings.

    Strings are updated directly. Dicts (keys and values) and lists (items)
    are rebuilt recursively. Any other object is returned unchanged.'''
    if isinstance(obj, str):
        return obj.replace(old, new)
    if isinstance(obj, list):
        return [Util.clean_dic(item, old, new) for item in obj]
    if not isinstance(obj, dict):
        return obj
    cleaned = {}
    for name, value in obj.items():
        # the key is cleaned before its value, as in a dict comprehension
        new_name = Util.clean_dic(name, old, new)
        cleaned[new_name] = Util.clean_dic(value, old, new)
    return cleaned
return a dict or list with updated strings by replacing "old" substring with "new" substring
@staticmethod
def filter_dic(obj, keys):
    '''Return an extract of a dict or of a list of dicts.

    *Parameters*

    - **obj** : dict or list of dict - object to filter (any other object
    is returned unchanged)
    - **keys** : string, list or tuple - list of keys or single key to return
    if 'all' or None, all keys are returned
    if list, only keys in list are returned
    if string, only values associated to the string(key) are returned'''
    if not keys or keys == 'all':
        return obj
    if isinstance(obj, list):
        return [Util.filter_dic(item, keys) for item in obj]
    if not isinstance(obj, dict):
        return obj
    if isinstance(keys, str):
        # single key: return the bare value, None when the key is absent
        return obj.get(keys)
    if isinstance(keys, (list, tuple)):
        return {name: value for name, value in obj.items() if name in keys}
    # any other kind of keys leaves the dict untouched
    return obj
return an extract of a dict or of a list of dicts
Parameters
- keys : string, list or tuple - list of keys or single key to return; if 'all' or None, all keys are returned; if list, only keys in the list are returned; if string, only the values associated with that key are returned
Analysis Exception
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback