tab-analysis.tab_analysis.analysis

This module analyses the structure and relationships included in a tabular object (Pandas DataFrame, Dataset, list of lists):

- Structure of a single field (class `AnaField`),
- Relationship between two fields (class `AnaRelation`),
- Structure and relationships of fields inside a dataset (class `AnaDfield`),
- Structure of a dataset (class `AnaDataset`).
# -*- coding: utf-8 -*-
"""
This module analyses structure and relationships included in a tabular object
(Pandas DataFrame, Dataset, list of list) :
- Structure of a single field (class `AnaField`),
- Relationship between two fields (class `AnaRelation`)
- Structure and relationships of fields inside a dataset (class `AnaDfield`)
- Structure of a dataset (class `AnaDataset`)

It contains two other classes `Util`, `AnaError`.
"""
import json
import pprint
from itertools import combinations
from operator import mul
from functools import reduce

NULL = 'null'
UNIQUE = 'unique'
COMPLETE = 'complete'
FULL = 'full'
DEFAULT = 'default'
MIXED = 'mixed'

COUPLED = 'coupled'
DERIVED = 'derived'
LINKED = 'linked'
CROSSED = 'crossed'
DISTRIBUTED = 'distributed'
ROOTED = 'rooted'
ROOT = 'root'

IDFIELD = 'id'
MINCODEC = 'mincodec'
MAXCODEC = 'maxcodec'
LENCODEC = 'lencodec'
RATECODEC = 'ratecodec'
DMINCODEC = 'dmincodec'
DMAXCODEC = 'dmaxcodec'
RANCODEC = 'rancodec'
TYPECODEC = 'typecodec'
HASHF = 'hashf'
RELATION = 'relation'
HASHR = 'hashr'
DIST = 'dist'
DMAX = 'dmax'
DMIN = 'dmin'
DIFF = 'diff'
DRAN = 'dran'
NUM = 'num'
CATEGORY = 'category'
PDERIVED = 'pderived'
PDISTANCE = 'pdistance'
PDISTOMIN = 'pdistomin'
DISDISTANCE = 'disdistance'
DERDISTANCE = 'derdistance'
DISRATECPL = 'disratecpl'
DERRATECPL = 'derratecpl'
DISRATEDER = 'disrateder'
DERRATEDER = 'derrateder'

TYPECOUPL = 'typecoupl'
PARENTCHILD = 'parentchild'
DISTANCE = 'distance'
DISTOMIN = 'distomin'
DISTOMAX = 'distomax'
DISTROOT = 'distroot'
RATECPL = 'ratecpl'
RATEDER = 'rateder'

IDDATASET = 'name'
RELATIONS = 'relations'
FIELDS = 'fields'
LENGTH = 'length'
HASHD = 'hashd'


class AnaField:
    '''This class analyses field entities.

    *Attributes*

    - **idfield** : string - name or Id of the field
    - **lencodec**: integer - codec length
    - **mincodec**: integer - minimal codec length
    - **maxcodec**: integer - maximal codec length
    - **hashf**: integer - hash value to identify modifications

    *characteristic (@property)*

    - `iscomplete`
    - `ratecodec`
    - `dmincodec`
    - `dmaxcodec`
    - `rancodec`
    - `typecodec`

    *instance methods*

    - `to_dict`

    '''

    def __init__(self, idfield, lencodec=None, mincodec=None, maxcodec=None, hashf=None):
        '''Creation mode :
        - single dict attribute where keys are attributes name,
        - single AnaField attribute to make a copy
        - multiple attributes

        *Parameters (multiple attributes)*

        - **idfield** : string or integer - Id of the Field
        - **lencodec** : integer (default None) - length of the codec
        - **mincodec** : integer (default None) - number of different values
        - **maxcodec** : integer (default None) - length of the field
        - **hashf** : string (default None) - update identifier

        *Raises*

        - **AnaError** : in multiple attributes mode, if lencodec is missing,
        zero or not an integer (dict and copy modes are not validated).

        *example*

        AnaField is created with a dict
        >>> AnaField({'id': 'a', 'lencodec': 4, 'mincodec': 3, 'maxcodec': 4})
        AnaField(a)

        AnaField is created with parameters
        >>> AnaField('a', lencodec=4, mincodec=3, maxcodec=4).typecodec
        'full'
        >>> len(AnaField('a', 4, 3, 4))
        4
        '''
        if isinstance(idfield, dict):
            self.idfield = idfield.get(IDFIELD, None)
            self.lencodec = idfield.get(LENCODEC, None)
            self.mincodec = idfield.get(MINCODEC, None)
            self.maxcodec = idfield.get(MAXCODEC, None)
            self.hashf = idfield.get(HASHF, None)
            return
        # AnaDfield derives from AnaField, so one isinstance test covers both
        if isinstance(idfield, AnaField):
            self.idfield = idfield.idfield
            self.lencodec = idfield.lencodec
            self.mincodec = idfield.mincodec
            self.maxcodec = idfield.maxcodec
            self.hashf = idfield.hashf
            return
        if not lencodec or not isinstance(lencodec, int):
            raise AnaError("lencodec is not correct")
        self.idfield = idfield
        self.lencodec = lencodec
        self.mincodec = mincodec
        self.maxcodec = maxcodec
        self.hashf = hashf

    def __len__(self):
        '''length of the field (maxcodec if defined and non-zero, else lencodec)'''
        return self.maxcodec if self.maxcodec else self.lencodec

    def __repr__(self):
        '''representation of the field (class name + idfield)'''
        return self.__class__.__name__ + '(' + str(self.idfield) + ')'

    def __eq__(self, other):
        ''' equal if class and attributes are equal'''
        # the class-name test comes first so that objects without the field
        # attributes are rejected before any attribute access
        return self.__class__.__name__ == other.__class__.__name__ and \
            self.idfield == other.idfield and self.lencodec == other.lencodec and \
            self.mincodec == other.mincodec and self.maxcodec == other.maxcodec and \
            self.hashf == other.hashf

    def __lt__(self, other):
        ''' return a comparison between hash value'''
        return hash(self) < hash(other)

    def __hash__(self):
        '''return hash value (sum of attributes hash)'''
        return hash(self.idfield) + hash(self.lencodec) + hash(self.mincodec) \
            + hash(self.maxcodec) + hash(self.hashf)

    def __str__(self):
        '''json-text build with the attributes dict'''
        # Call the AnaField implementation explicitly: a subclass may override
        # to_dict with another signature (AnaDfield.to_dict only accepts
        # 'mode'), which made str() raise a TypeError on such objects.
        return json.dumps(AnaField.to_dict(self, idfield=True))

    def __copy__(self):
        ''' Copy all the attributes '''
        return self.__class__(self)

    def to_dict(self, full=False, idfield=False, notnone=True):
        '''return a dict with field attributes.

        *Parameters*

        - **full** : boolean (default False) - if True, all the attributes are included
        - **idfield** : boolean (default False) - if True, idfield is included
        - **notnone** : boolean (default True) - if True, the dict is passed to
        Util.reduce_dic before being returned (None values are not included)
        '''
        dic = {LENCODEC: self.lencodec, MINCODEC: self.mincodec,
               MAXCODEC: self.maxcodec}
        if idfield or full:
            dic[IDFIELD] = self.idfield
        if full:
            dic |= {RATECODEC: self.ratecodec, DMINCODEC: self.dmincodec,
                    DMAXCODEC: self.dmaxcodec, RANCODEC: self.rancodec,
                    TYPECODEC: self.typecodec}
        if notnone:
            return Util.reduce_dic(dic)
        return dic

    @property
    def iscomplete(self):
        '''return boolean indicator : True if mincodec and maxcodec are both defined'''
        return self.maxcodec is not None and self.mincodec is not None

    @property
    def ratecodec(self):
        '''return float ratecodec indicator
        (None if the field is not complete or if maxcodec equals mincodec)'''
        if self.iscomplete and self.maxcodec - self.mincodec:
            return (self.maxcodec - self.lencodec) / (self.maxcodec - self.mincodec)
        return None

    @property
    def dmincodec(self):
        '''return integer dmincodec indicator (lencodec - mincodec)'''
        return self.lencodec - self.mincodec if self.iscomplete else None

    @property
    def dmaxcodec(self):
        '''return integer dmaxcodec indicator (maxcodec - lencodec)'''
        return self.maxcodec - self.lencodec if self.iscomplete else None

    @property
    def rancodec(self):
        '''return integer rancodec indicator (maxcodec - mincodec)'''
        return self.maxcodec - self.mincodec if self.iscomplete else None

    @property
    def typecodec(self):
        '''return string typecodec indicator
        (null, unique, complete, full, default, mixed)

        The tests are ordered: the first matching rule gives the result.
        None is returned if the field is not complete.
        '''
        if not self.iscomplete:
            return None
        if self.maxcodec == 0:
            return NULL
        if self.lencodec == 1:
            return UNIQUE
        if self.mincodec == self.maxcodec:
            return COMPLETE
        if self.lencodec == self.maxcodec:
            return FULL
        if self.lencodec == self.mincodec:
            return DEFAULT
        return MIXED
class AnaRelation:
    '''This class analyses relationship between two fields

    *Attributes* :

    - **relation** : List of the two fields involved in the relationship
    - **dist** : value of the relationship
    - **distrib** : boolean True if values are distributed
    - **hashr**: integer - hash value to identify update

    *global (@property)*

    - `id_relation`
    - `index_relation`
    - `parent_child`
    - `typecoupl`

    *characteristic (@property)*

    - `dmax`
    - `dmin`
    - `diff`
    - `dran`
    - `distomin`
    - `distomax`
    - `distance`
    - `ratecpl`
    - `rateder`

    *instance methods*

    - `to_dict`
    '''

    def __init__(self, relation, dists, hashr=None):
        '''Constructor of the relationship :

        *Parameters*

        - **relation** : List of the two fields involved in the relationship
        - **dists** : dist value, or list / tuple holding the dist value
        followed by the distrib boolean (distrib is None when it is absent)
        - **hashr**: integer - hash value to identify update
        '''
        self.relation = relation
        # tuples are accepted like lists; a sequence without a second item
        # leaves distrib undefined instead of raising an IndexError
        if isinstance(dists, (list, tuple)):
            self.dist = dists[0]
            self.distrib = dists[1] if len(dists) > 1 else None
        else:
            self.dist = dists
            self.distrib = None
        self.hashr = hashr

    def __repr__(self):
        '''representation of the relation (class name + list of idfield)'''
        return self.__class__.__name__ + '(' + str(self.id_relation) + ')'

    def __str__(self):
        '''json-text build with the attributes dict'''
        return json.dumps(self.to_dict(relation=True))

    def __eq__(self, other):
        ''' equal if class and values are equal'''
        return self.__class__.__name__ == other.__class__.__name__ and \
            self.relation == other.relation and self.dist == other.dist and \
            self.hashr == other.hashr and self.distrib == other.distrib

    def __hash__(self):
        '''return hash value (sum of attributes hash)'''
        return hash(self.relation[0]) + hash(self.relation[1]) + \
            hash(self.dist) + hash(self.hashr) + hash(self.distrib)

    def to_dict(self, distances=False, full=False, mode='field', relation=False,
                notnone=True, misc=False):
        '''return a dict with AnaRelation attributes.

        *Parameters*

        - **distances** : boolean (default False) - if True, distances indicators are included
        - **full** : boolean (default False) - if True, all the attributes are included
        - **relation** : boolean (default False) - if True, the fields of the
        relation and the parent_child indicator are included
        - **notnone** : boolean (default True) - if True, the dict is passed to
        Util.reduce_dic before being returned (None values are not included)
        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
        - **misc** : boolean (default False) - if True, dmax, dmin, diff and
        dran indicators are included
        '''
        dic = {DIST: self.dist, TYPECOUPL: self.typecoupl, HASHR: self.hashr}
        if relation or full:
            dic[RELATION] = Util.view(self.relation, mode)
            dic[PARENTCHILD] = self.parent_child
        if distances or full:
            dic |= {DISTANCE: self.distance, DISTOMIN: self.distomin,
                    DISTOMAX: self.distomax, DISTRIBUTED: self.distrib,
                    RATECPL: self.ratecpl, RATEDER: self.rateder}
        if misc or full:
            dic |= {DMAX: self.dmax, DMIN: self.dmin,
                    DIFF: self.diff, DRAN: self.dran}
        if notnone:
            return Util.reduce_dic(dic)
        return dic

    @property
    def id_relation(self):
        '''return a list with the id of the two fields involved'''
        if self.relation:
            return [fld.idfield for fld in self.relation]
        return []

    @property
    def parent_child(self):
        '''returns the direction of the relationship (True if parent is first)

        The parent is the field with the longest codec; with equal codec
        lengths, the field with the lowest index is the parent.'''
        first, second = self.relation[0], self.relation[1]
        if first.lencodec != second.lencodec:
            return first.lencodec > second.lencodec
        return first.index < second.index

    @property
    def index_relation(self):
        '''return a list with the index of the two fields involved'''
        if self.relation:
            return [fld.index for fld in self.relation]
        return []

    @property
    def dmax(self):
        '''return integer dmax indicator (product of the two lencodec)'''
        return self.relation[0].lencodec * self.relation[1].lencodec

    @property
    def dmin(self):
        '''return integer dmin indicator (max of the two lencodec)'''
        return max(self.relation[0].lencodec, self.relation[1].lencodec)

    @property
    def diff(self):
        '''return integer diff indicator (absolute difference of the two lencodec)'''
        return abs(self.relation[0].lencodec - self.relation[1].lencodec)

    @property
    def dran(self):
        '''return integer dran indicator (dmax - dmin)'''
        return self.dmax - self.dmin

    @property
    def distomin(self):
        '''return integer distomin indicator (dist - dmin)'''
        return self.dist - self.dmin

    @property
    def distomax(self):
        '''return integer distomax indicator (dmax - dist)'''
        return self.dmax - self.dist

    @property
    def distance(self):
        '''return integer distance indicator (distomin + diff)'''
        return self.distomin + self.diff

    @property
    def ratecpl(self):
        '''return float ratecpl indicator (0 if distance + distomax is null)'''
        disdis = self.distance + self.distomax
        return 0 if disdis == 0 else self.distance / disdis

    @property
    def rateder(self):
        '''return float rateder indicator (0 if dran is null)'''
        return 0 if self.dran == 0 else self.distomin / self.dran

    @property
    def typecoupl(self):
        '''return relationship type (coupled, derived, crossed, linked)

        The tests are ordered: the first matching rule gives the result.'''
        if self.distance == 0:
            return COUPLED
        if self.distomin == 0:
            return DERIVED
        if self.distomax == 0:
            return CROSSED
        return LINKED
class AnaDfield(AnaField):
    '''This class analyses structure and relationships of fields inside a dataset

    *Attributes* :

    - **dataset** : AnaDataset object where AnaDfield is included
    - **AnaField attributes** : inheritance of AnaField object

    *relationship (@property)*

    - `list_relations`
    - `list_p_derived`
    - `list_c_derived`
    - `list_coupled`

    *field (@property)*

    - `fields`
    - `p_derived`
    - `p_distance`
    - `p_distomin`

    *global (@property)*

    - `index`
    - `dist_root`
    - `category`

    *global (instance methods)*

    - `ascendants`
    - `to_dict`
    - `view`

    *other instance methods*

    - `dic_inner_node`
    '''
    def __new__(cls, other, dataset=None):
        '''initialization of attributes from "other"

        The object is built as a copy of "other" when it is an AnaField or an
        AnaDfield; otherwise a bare instance is created.'''
        if isinstance(other, AnaDfield):
            return AnaDfield.__copy__(other)
        if isinstance(other, AnaField):
            # copy the AnaField attributes, then promote the copy to AnaDfield
            new = AnaField.__copy__(other)
            new.__class__ = AnaDfield
            return new
        return object.__new__(cls)

    def __init__(self, other, dataset):
        '''AnaDfield is created by adding a AnaDataset link to an AnaField object.

        *Parameters*

        - **other** : AnaField or AnaDfield to initialize attributes
        (already used by `__new__`)
        - **dataset** : AnaDataset which includes the AnaDfield
        '''
        self.dataset = dataset

    def __copy__(self):
        ''' Copy all the data '''
        return self.__class__(AnaField(self), self.dataset)

    def __lt__(self, other):
        ''' return a comparison between field index'''
        return self.index < other.index

    @property
    def index(self):
        '''return the row of the field in the AnaDataset (-1 for the root field)'''
        if self == self.dataset.root:
            return -1
        return self.dataset.fields.index(self)

    @property
    def fields(self):
        '''return the list of the fields included in the AnaDataset'''
        return self.dataset.fields

    @property
    def list_relations(self):
        '''return the list of the relations with the AnaDfield'''
        return list(self.dataset.relations[self].values())

    @property
    def list_p_derived(self):
        '''return the list of the derived relations with the parents of AnaDfield'''
        return [rel for rel in self.list_relations
                if rel.typecoupl == DERIVED and not rel.parent_child]

    @property
    def list_c_derived(self):
        '''return the list of the derived relations with the childs of AnaDfield
        (childs with a unique category are excluded)'''
        return [rel for rel in self.list_relations
                if rel.typecoupl == DERIVED
                and rel.parent_child
                and rel.relation[1].category != UNIQUE]

    @property
    def list_coupled(self):
        '''return the list of the coupled relations with the AnaDfield'''
        return [rel for rel in self.list_relations if rel.typecoupl == COUPLED]

    @property
    def dist_root(self):
        '''return the distance to the root field'''
        return len(self.dataset) - self.lencodec

    @property
    def category(self):
        '''return AnaDfield category (unique, rooted, coupled, derived, mixed)

        The tests are ordered: the first matching rule gives the result.'''
        typecodec = self.typecodec
        if typecodec == UNIQUE:
            return UNIQUE
        if typecodec in (COMPLETE, FULL):
            return ROOTED
        # coupled with a field which is not one of its childs
        if any(rel.typecoupl == COUPLED for rel in self.list_relations
               if not rel.parent_child):
            return COUPLED
        if not self.list_c_derived:
            return DERIVED
        return MIXED

    @property
    def p_derived(self):
        '''return the first derived or coupled parent of the AnaDfield'''
        category = self.category
        if category in (UNIQUE, ROOTED):
            return self.dataset.root
        if category == COUPLED:
            # first coupled field which is not itself in the coupled category
            return [rel.relation[1] for rel in self.list_coupled
                    if rel.relation[1].category != COUPLED][0]
        p_derived = self.list_p_derived
        if not p_derived:
            return self.dataset.root
        distance_min = min(rel.distance for rel in p_derived)
        for rel in p_derived:
            if rel.distance == distance_min:
                parent = rel.relation[1]
                if parent.category == ROOTED:
                    return self.dataset.root
                if parent.category == MIXED:
                    return parent
        return self.dataset.root

    @property
    def p_distance(self):
        '''return the first parent with minimal distance of the AnaDfield'''
        return self._p_min_dist()

    @property
    def p_distomin(self):
        '''return the first parent with minimal distomin of the AnaDfield'''
        return self._p_min_dist(False)

    def _p_min_dist(self, distance=True):
        '''return the parent with minimal distance of the AnaDfield

        *Parameters*

        - **distance** : boolean (default True) - if True the `distance`
        indicator of the relations is used, else the `distomin` indicator
        '''
        if self.category == UNIQUE:
            return self.dataset.root
        indicator = 'distance' if distance else 'distomin'
        relations = self.list_relations  # built once (the property rebuilds it)
        dist_up = [getattr(rel, indicator) for rel in relations
                   if not rel.parent_child]
        if not dist_up:
            return self.dataset.root
        dist_min = min(dist_up)
        if dist_min == self.dist_root:
            return self.dataset.root
        # NOTE(review): as in the original code, the candidates are taken from
        # all the relations (not only those with 'not parent_child') - confirm
        # that this is intended.
        list_dmin = [rel.relation[1] for rel in relations
                     if getattr(rel, indicator) == dist_min]
        max_lencodec = max(fld.lencodec for fld in list_dmin)
        return next(fld for fld in list_dmin if fld.lencodec == max_lencodec)

    def to_dict(self, mode='id'):
        '''return a dict with field attributes.

        *Parameters*

        - **mode** : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
        '''
        dic = super().to_dict(full=True, idfield=False, notnone=False)
        dic[DISTROOT] = self.dist_root
        dic[NUM] = self.index
        dic[CATEGORY] = self.category
        dic[PDISTANCE] = self.p_distance.view(mode)
        dic[PDISTOMIN] = self.p_distomin.view(mode)
        dic[PDERIVED] = self.p_derived.view(mode)
        return dic

    def view(self, mode='field'):
        ''' return a representation of the AnaDfield

        *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
        '''
        return Util.view(self, mode)

    def ascendants(self, typeparent='derived', mode='field'):
        ''' return the list of the AnaDfield's ascendants in the family tree up to
        the root AnaDfield.

        *Parameters*

        - **typeparent** : str (default 'derived') - 'derived', 'distance' or 'distomin'
        (any other value is processed as 'distomin')
        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')

        *Returns* : list of parents from closest to the most distant. Parents
        are represented with index, idfield, or object
        '''
        root = self.dataset.root
        parent = self
        listparent = []
        while parent != root:
            if typeparent == 'derived':
                parent = parent.p_derived
            elif typeparent == 'distance':
                parent = parent.p_distance
            else:
                parent = parent.p_distomin
            if parent != root:
                listparent.append(parent)
        return Util.view(listparent, mode)

    def dic_inner_node(self, mode='derived', lname=20):
        '''return a child AnaDfield tree.

        *Parameters*

        - **lname** : integer (default 20) - maximal length of the names
        - **mode** : string (default 'derived') - kind of tree :
            'derived' : derived tree
            'distance': min distance tree
            'distomin': min distomin tree

        *Raises*

        - **AnaError** : if mode is not one of the three kinds of tree (an
        unknown mode previously ended with an UnboundLocalError).

        *Returns* : dict where key is a AnaDfield and value is the list of
        the childs "name ( dist - lencodec)".
        '''
        if mode not in ('derived', 'distance', 'distomin'):
            raise AnaError("mode is not correct")
        # the parent properties are named p_derived, p_distance and p_distomin
        parent_attr = 'p_' + mode
        rel_parent = self.dataset.get_relation(self, getattr(self, parent_attr))
        # the distomin tree is labelled with distomin, the two others with distance
        dist = rel_parent.distomin if mode == 'distomin' else rel_parent.distance
        adding = str(dist) + ' - ' + str(self.lencodec)
        name = str(self.idfield)[:lname] + ' (' + adding + ')'
        # '*' is a placeholder for spaces and quotes (restored by AnaDataset.tree)
        lis = [name.replace(' ', '*').replace("'", '*')]
        if mode == 'derived':
            childs = []
            category = self.category
            if category not in (ROOTED, COUPLED, UNIQUE):
                for rel in self.list_coupled:
                    lis.append(rel.relation[1].dic_inner_node(mode, lname))
            if category not in (ROOTED, UNIQUE):
                childs = [rel.relation[1] for rel in self.list_relations
                          if rel.relation[1].p_derived == self and
                          rel.relation[1].category != COUPLED]
        else:
            childs = [rel.relation[1] for rel in self.list_relations
                      if getattr(rel.relation[1], parent_attr) == self]
        for fld in childs:
            lis.append(fld.dic_inner_node(mode, lname))
        return {str(self.index).ljust(2, '*'): lis}
= [rel.relation[1] for rel in self.list_relations 697 if rel.relation[1].p_distance == self] 698 for fld in childs: 699 lis.append(fld.dic_inner_node(mode, lname)) 700 return {str(self.index).ljust(2, '*'): lis} 701 702 703class AnaDataset: 704 '''This class analyses the structure of a dataset. 705 706 *Attributes* : 707 708 - **iddataset** : string or integer - Id of the Dataset 709 - **fields** : list of the AnaDfields included 710 - **relations** : dict of the AnaRelations between two AnaDfields 711 - **hashd** : string - update identifier 712 713 *relationship (@property)* 714 715 - `ana_relations` 716 - `p_relations` 717 718 *field (@property)* 719 720 - `root` 721 - `primary` 722 - `secondary` 723 - `unique` 724 - `mixte` 725 - `variable` 726 727 *global (@property)* 728 729 - `category` 730 - `complete` 731 - `dimension` 732 733 *update (instance methods)* 734 735 - `set_relations` 736 737 *access (instance methods)* 738 739 - `get_relation` 740 - `dfield` 741 742 *synthesis (instance methods)* 743 744 - `tree` 745 - `to_dict` 746 - `indicator` 747 - `partitions` 748 - `field_partition` 749 - `relation_partition` 750 ''' 751 752 def __init__(self, fields=None, relations=None, iddataset=None, 753 leng=None, hashd=None): 754 '''Creation mode : 755 - single dict attribute where keys are attributes name, 756 - single AnaDataset attribute to make a copy 757 - multiple attributes 758 759 *Parameters (single dict)* 760 761 - **fields**: {'fields': list_of_dict, 'name': id_dataset, 762 'length': length, 'relations': dict_of_relations 763 where: 764 list_of_dict : {'id': id_field, 'lencodec': len_codec, 'mincodec': min_codec} 765 id_field: string - name of field 766 other_field: string - name of field 767 len_codec: int - length of the codec 768 min_codec: int - number of different codec values 769 id_dataset : name of the dataset 770 length: int - length of the dataset 771 dict_of_relations: {id_field : {other_field: dist} for all fields} 772 field: name of a field 
773 field_other: name of another field 774 dist: integer (distance between the two fields) or 775 array (distance and boolean distributed) 776 777 *Parameters (multiple attributes)* 778 779 - **fields**: list_of_dict 780 - **iddataset** : string (default None) - id_dataset 781 - **relations** : dict (default None) - dict_of_relations 782 - **leng** : int (default None) - length 783 - **hashd** : string (default None) - update identifier 784 ''' 785 if isinstance(fields, AnaDataset): 786 self.iddataset = fields.iddataset 787 self.fields = fields.fields 788 self.relations = fields.relations 789 self.hashd = fields.hashd 790 return 791 if isinstance(fields, dict): 792 iddataset = fields.get(IDDATASET, None) 793 leng = fields.get(LENGTH, None) 794 relations = fields.get(RELATIONS, None) 795 hashd = fields.get(HASHD) 796 fields = fields.get(FIELDS, None) 797 self.iddataset = iddataset 798 self.fields = [AnaDfield(AnaField(field), self) 799 for field in fields] if fields else [] 800 if leng: 801 for fld in self.fields: 802 fld.maxcodec = leng 803 self.relations = {field: {} for field in self.fields} 804 if relations: 805 for fld, dic_relation in relations.items(): 806 self.set_relations(fld, dic_relation) 807 self.hashd = hashd 808 809 def __len__(self): 810 '''length of the AnaDataset (len of the AnaDfields included)''' 811 return max(len(fld) for fld in self.fields) 812 813 def __eq__(self, other): 814 ''' equal if class and values are equal''' 815 return self.__class__ .__name__ == other.__class__.__name__ and \ 816 self.fields == other.fields and self.relations == other.relations and \ 817 self.iddataset == other.iddataset and self.hashd == other.hashd 818 819 def __hash__(self): 820 '''return hash value (sum of attributes hash)''' 821 return hash(self.iddataset) + sum(hash(fld) for fld in self.fields) + \ 822 sum(hash(rel) for rel in self.relations) + hash(self.hashd) 823 824 @property 825 def category(self): 826 '''return a list of AnaDfield category (unique, 
rooted, coupled, derived, mixed)''' 827 return [fld.category for fld in self.fields] 828 829 @property 830 def ana_relations(self): 831 '''return the list of AnaRelation included''' 832 return [rel for fldrel in self.relations.values() for rel in fldrel.values()] 833 834 @property 835 def p_relations(self): 836 '''return the list of oriented AnaRelation (parent first, child second)''' 837 return [rel for rel in self.ana_relations if rel.parent_child] 838 839 @property 840 def root(self): 841 '''return the root AnaDfield''' 842 len_self = len(self) 843 return AnaDfield(AnaField(ROOT, len_self, len_self, len_self), self) 844 845 @property 846 def primary(self): 847 '''return the first partition of the partitions''' 848 return self.field_partition(mode='field')['primary'] 849 # part = self.partitions(mode='field', distributed=True) 850 # return part[0] if part else [] 851 852 @property 853 def complete(self): 854 '''return True if the dimension is not 0''' 855 return self.dimension > 0 856 857 @property 858 def dimension(self): 859 '''return the highest partition lenght''' 860 return len(self.primary) 861 862 @property 863 def secondary(self): 864 '''return the derived ou coupled fields from primary''' 865 return self.field_partition(mode='field')['secondary'] 866 867 @property 868 def unique(self): 869 '''return the unique fields''' 870 return [fld for fld in self.fields if fld.category == UNIQUE] 871 872 @property 873 def variable(self): 874 '''return the variable fields''' 875 return self.field_partition(mode='field')['variable'] 876 877 @property 878 def mixte(self): 879 '''return the variable fields''' 880 return self.field_partition(mode='field')['mixte'] 881 882 def set_relations(self, field, dic_relations): 883 '''Add relations in the AnaDataset from a dict. 
884 885 *Parameters* 886 887 - **field** : AnaDfield, AnaField or str (idfield) - first relation AnaDfield 888 - **dic_relations** : dict - key is the second relation AnaDfield and 889 value is the dist value or teh list [dist, distrib] 890 ''' 891 fld = self.dfield(field) 892 for other, dist in dic_relations.items(): 893 oth = self.dfield(other) 894 self.relations[fld][oth] = AnaRelation([fld, oth], dist) 895 self.relations[oth][fld] = AnaRelation([oth, fld], dist) 896 897 def get_relation(self, fld1, fld2): 898 '''Return AnaRelation between fld1 and fld2. 899 900 *Parameters* 901 902 - **fld1** : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield 903 - **fld2** : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield 904 ''' 905 fl1 = self.dfield(fld1) 906 fl2 = self.dfield(fld2) 907 if self.root in [fl1, fl2]: 908 return AnaRelation([fl1, fl2], len(self)) 909 return self.relations[self.dfield(fld1)][self.dfield(fld2)] 910 911 def dfield(self, fld): 912 '''return the AnaDfield matching with fld. Fld is str, int, AnaDfield or AnaField''' 913 if fld in (-1, ROOT): 914 return self.root 915 if isinstance(fld, AnaDfield): 916 return fld 917 if isinstance(fld, int): 918 return self.fields[fld] 919 if isinstance(fld, str): 920 if fld in [dfld.idfield for dfld in self.fields]: 921 return [dfld for dfld in self.fields if dfld.idfield == fld][0] 922 return None 923 return AnaDfield(fld, self) 924 925 def tree(self, mode='derived', width=5, lname=20, string=True): 926 '''return a string with a tree of derived Field. 
927 928 *Parameters* 929 930 - **lname** : integer (default 20) - length of the names 931 - **width** : integer (default 5) - length of the lines 932 - **string** : boolean (default True) - if True return str else return dict 933 - **mode** : string (default 'derived') - kind of tree : 934 'derived' : derived tree 935 'distance': min distance tree 936 'distomin': min distomin tree 937 ''' 938 lis = ['root-' + mode + '*(' + str(len(self)) + ')'] 939 if mode == 'distance': 940 childs = [fld for fld in self.fields if fld.p_distance == self.root] 941 elif mode == 'distomin': 942 childs = [fld for fld in self.fields if fld.p_distomin == self.root] 943 elif mode == 'derived': 944 childs = [fld for fld in self.fields if fld.p_derived == self.root] 945 for fld in childs: 946 lis.append(fld.dic_inner_node(mode, lname)) 947 tree = {str(-1).ljust(2, '*'): lis} 948 if string: 949 tre = pprint.pformat(tree, indent=0, width=width) 950 tre = tre.replace('---', ' - ') 951 tre = tre.replace(' ', ' ') 952 tre = tre.replace('*', ' ') 953 for car in ["'", "\"", "{", "[", "]", "}", ","]: 954 tre = tre.replace(car, "") 955 return tre 956 return Util.clean_dic(tree, '*', ' ') 957 958 def to_dict(self, mode='field', keys=None, relations=False): 959 '''return a dict with fields attributes and optionaly relations attributes. 
960 961 *Parameters* 962 963 - **mode** : str (default 'field') - AnaDfield representation 964 ('field', 'id', 'index') 965 - **relations** : boolean (default: False) - if False return a list of fields, 966 if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}' 967 - **keys** : string, list or tuple - list of keys or single key to return 968 if 'all' or None, all keys are returned 969 if list, only keys in list are returned 970 if string, only values associated to the string(key) are returned''' 971 fields = Util.filter_dic([fld.to_dict(mode=mode) 972 for fld in self.fields], keys) 973 leng = len(self.fields) 974 if not relations: 975 return fields 976 return {'fields': fields, 'relations': 977 [self.get_relation(i, j).to_dict(full=True, mode=mode) 978 for i in range(-1, leng) for j in range(i + 1, leng)]} 979 980 def partitions(self, mode='id', distributed=True): 981 '''return a list of available partitions (the first is highest). 982 983 *Parameters* 984 985 - **mode** : str (default 'id') - AnaDfield representation 986 ('field', 'id', 'index') 987 - **distributed** : boolean (default True) - Include only distributed fields 988 ''' 989 partit = [[fld] for fld in self.fields if fld.category == ROOTED] 990 crossed = [rel for rel in self.ana_relations if rel.typecoupl == CROSSED 991 and rel.parent_child 992 and rel.relation[0].category != COUPLED 993 and rel.relation[1].category != COUPLED] 994 if distributed: 995 crossed = [rel for rel in crossed if rel.distrib] 996 if crossed and len(crossed) == 1 and crossed[0].dist == len(self): 997 partit.insert(0, crossed[0].relation) 998 elif crossed: 999 for repeat in list(range(len(crossed))): 1000 candidates = combinations(crossed, repeat + 1) 1001 for candidat in candidates: 1002 flds = list(set(rel.relation[i] 1003 for rel in candidat for i in [0, 1])) 1004 if (reduce(mul, [fld.lencodec for fld in flds]) == len(self) and 1005 len(candidat) == sum(range(len(flds))) and 1006 (not distributed 
or min(rel.distrib for rel in candidat))): 1007 partit.insert(0, flds) 1008 partit = [list(tup) for tup in 1009 sorted(sorted(list({tuple(sorted(prt)) for prt in partit})), 1010 key=len, reverse=True)] 1011 return Util.view(partit, mode) 1012 1013 def field_partition(self, mode='id', partition=None, distributed=True): 1014 '''return a partition dict with the list of primary, secondary, unique 1015 and variable fields. 1016 1017 *Parameters* 1018 1019 - **mode** : str (default 'id') - AnaDfield representation 1020 ('field', 'id', 'index') 1021 - **partition** : list of str, int, AnaDfield or AnaField(default None) - 1022 if None, partition is the first 1023 - **distributed** : boolean (default True) - Include only distributed fields 1024 ''' 1025 partitions = self.partitions(mode='field', distributed=distributed) 1026 if not partitions: 1027 return Util.view( 1028 {'primary': [], 'secondary': [ 1029 fld for fld in self.fields if fld.category != UNIQUE], 1030 'mixte': [], 'unique': [ 1031 fld for fld in self.fields if fld.category == UNIQUE], 1032 'variable': []}, mode) 1033 if not partition: 1034 partition = partitions[0] 1035 else: 1036 # partition = [self.dfield(fld) for fld in tuple(sorted(partition))] 1037 partition = [self.dfield(fld) for fld in tuple(partition)] 1038 secondary = [] 1039 for field in partition: 1040 self._add_child(field, secondary) 1041 secondary = [fld for fld in secondary if not fld in partition] 1042 unique = [fld for fld in self.fields if fld.category == UNIQUE] 1043 mixte = list(self._mixte_dims(partition, partitions)) 1044 variable = [fld for fld in self.fields 1045 if not fld in partition + secondary + unique + mixte] 1046 return Util.view({'primary': partition, 'secondary': secondary, 1047 'mixte': mixte, 'unique': unique, 1048 'variable': variable}, mode) 1049 1050 def relation_partition(self, partition=None, primary=False, noroot=False): 1051 '''return a dict with the list of relationships for fields in a partition. 
1052 1053 *Parameters* 1054 1055 - **partition** : list (default None) - if None, partition is the first 1056 - **primary** : boolean (default False) - if True, relations are primary fields 1057 - **noroot** : boolean (default False) - if True and single primary, 1058 'root' field is replaced by the primary field''' 1059 partitions = self.partitions(mode='field') 1060 if not partitions: 1061 partition = None 1062 else: 1063 partition = Util.view(partition, mode='field', 1064 ana=self) if partition else partitions[0] 1065 part = self.field_partition( 1066 mode='field', partition=partition, distributed=True) 1067 fields_cat = {fld: cat for cat, l_fld in part.items() for fld in l_fld} 1068 relations = {} 1069 for field in fields_cat: 1070 rel = [] 1071 match fields_cat[field]: 1072 case 'primary': 1073 rel = [field.idfield] 1074 case 'unique': ... 1075 case 'variable': 1076 rel = [fld.idfield for fld in part['primary']] 1077 case 'secondary' if not primary: 1078 rel = [field.p_derived.idfield] 1079 case 'secondary' if primary: 1080 rel = [fld.idfield for fld in field.ascendants() 1081 if fld in part['primary']] 1082 case 'mixte': 1083 rel = [fld.idfield for fld in self._mixte_dims( 1084 partition, partitions)[field]] 1085 case _: ... 
1086 if rel == ['root'] and len(part['primary']) == 1 and noroot: 1087 rel = [part['primary'][0].idfield] 1088 if rel == ['root'] and len(part['primary']) == 0 and noroot: 1089 rel = [part['secondary'][0].idfield] 1090 relations[field.idfield] = rel 1091 return relations 1092 1093 def indicator(self, fullsize, size): 1094 '''generate size indicators: ol (object lightness), ul (unicity level), 1095 gain (sizegain) 1096 1097 *Parameters* 1098 1099 - **fullsize** : int - size with full codec 1100 - **size** : int - size with existing codec 1101 1102 *Returns* : dict''' 1103 lenindex = len(self.fields) 1104 indexlen = sum(fld.lencodec for fld in self.fields) 1105 nval = len(self) * (lenindex + 1) 1106 sval = fullsize / nval 1107 ncod = indexlen + lenindex 1108 1109 if nval != ncod: 1110 scod = (size - ncod * sval) / (nval - ncod) 1111 olight = scod / sval 1112 else: 1113 olight = None 1114 return {'total values': nval, 'mean size': round(sval, 3), 1115 'unique values': ncod, 'mean coding size': round(scod, 3), 1116 'unicity level': round(ncod / nval, 3), 1117 'optimize level': round(size / fullsize, 3), 1118 'object lightness': round(olight, 3), 1119 'maxgain': round((nval - ncod) / nval, 3), 1120 'gain': round((fullsize - size) / fullsize, 3)} 1121 1122 def _add_child(self, field, childs): 1123 ''' add derived or coupled fields in the childs list''' 1124 for rel in field.list_c_derived + field.list_coupled: 1125 child = rel.relation[1] 1126 if not child in childs and not child.category == UNIQUE: 1127 childs.append(child) 1128 if not child.category in (COUPLED, UNIQUE): 1129 self._add_child(child, childs) 1130 1131 def _mixte_dims(self, partition, partitions): 1132 '''return dict with dimensions associated to each mixte field''' 1133 dic_mixte = {} 1134 for part in partitions: 1135 not_part = [fld for fld in part if not fld in partition] 1136 if len(not_part) == 1 and len(partition) > len(part) > 1: 1137 sub_part = [fld for fld in partition if not fld in part] 1138 if 
min(self.get_relation(not_part[0], fld).typecoupl == 'derived' 1139 for fld in sub_part) is True: 1140 dic_mixte[not_part[0]] = sub_part 1141 return dic_mixte 1142 1143 1144class Util: 1145 ''' common functions for analysis package''' 1146 1147 @staticmethod 1148 def view(field_struc, mode, ana=None): 1149 ''' return a representation of a AnaDfields structure (field, id, index). 1150 1151 *Parameters* 1152 1153 - **mode** : str - AnaDfield representation ('field', 'id', 'index') 1154 - **field_struc** : list or dict - structure to represent 1155 - **ana** : AnaDataset (default None) - to convert string or index in AnaDfield 1156 ''' 1157 1158 if mode is None or not field_struc: 1159 return field_struc 1160 if isinstance(field_struc, dict): 1161 return {key: Util.view(val, mode=mode, ana=ana) 1162 for key, val in field_struc.items()} 1163 if isinstance(field_struc, list): 1164 return [Util.view(val, mode=mode, ana=ana) for val in field_struc] 1165 if not isinstance(field_struc, AnaDfield) and mode != 'id': 1166 return Util.view(ana.dfield(field_struc), mode=mode) 1167 return field_struc if mode == 'field' else ( 1168 field_struc.index if mode == 'index' else field_struc.idfield) 1169 1170 @staticmethod 1171 def reduce_dic(obj, notempty=False): 1172 '''return a dict without None values''' 1173 if isinstance(obj, dict): 1174 return {key: Util.reduce_dic(val) for key, val in obj.items() 1175 if not val is None and (not notempty or val)} 1176 if isinstance(obj, list): 1177 return [Util.reduce_dic(val) for val in obj] 1178 return obj 1179 1180 @staticmethod 1181 def clean_dic(obj, old, new): 1182 '''return a dict or list with updated strings by replacing "old" substring 1183 with "new" substring''' 1184 if isinstance(obj, dict): 1185 return {Util.clean_dic(key, old, new): Util.clean_dic(val, old, new) 1186 for key, val in obj.items()} 1187 if isinstance(obj, str): 1188 return obj.replace(old, new) 1189 if isinstance(obj, list): 1190 return [Util.clean_dic(val, old, new) 
for val in obj] 1191 return obj 1192 1193 @staticmethod 1194 def filter_dic(obj, keys): 1195 '''return extract of a list of dict or of a dict 1196 1197 *Parameters* 1198 1199 - **keys** : string, list or tuple - list of keys or single key to return 1200 if 'all' or None, all keys are returned 1201 if list, only keys in list are returned 1202 if string, only values associated to the string(key) are returned''' 1203 if not keys or keys == 'all': 1204 return obj 1205 if isinstance(obj, list): 1206 return [Util.filter_dic(dic, keys) for dic in obj] 1207 if isinstance(keys, str) and isinstance(obj, dict): 1208 return obj.get(keys, None) 1209 if isinstance(keys, (list, tuple)) and isinstance(obj, dict): 1210 return {key: val for key, val in obj.items() if key in keys} 1211 return obj 1212 1213 1214class AnaError(Exception): 1215 ''' Analysis Exception'''
class AnaField:
    '''Describe the codec structure of a single field.

    *Attributes*

    - **idfield** : string - name or Id of the field
    - **lencodec**: integer - codec length
    - **mincodec**: integer - minimal codec length
    - **maxcodec**: integer - maximal codec length
    - **hashf**: integer - hash value to identify modifications

    *characteristic (@property)*

    - `iscomplete`
    - `ratecodec`
    - `dmincodec`
    - `dmaxcodec`
    - `rancodec`
    - `typecodec`

    *instance methods*

    - `to_dict`

    '''

    def __init__(self, idfield, lencodec=None, mincodec=None, maxcodec=None, hashf=None):
        '''Three creation modes are available :
        - a single dict whose keys are the attribute names,
        - a single AnaField (or AnaDfield) to copy,
        - the individual attributes.

        *Parameters (multiple attributes)*

        - **idfield** : string or integer - Id of the Field
        - **lencodec** : integer (default None) - length of the codec
        - **mincodec** : integer (default None) - number of different values
        - **maxcodec** : integer (default None) - length of the field
        - **hashf** : string (default None) - update identifier

        In the multiple attributes mode an AnaError is raised when lencodec
        is falsy or is not an integer.

        *example*

        AnaField is created with a dict
        >>> AnaField({'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}).to_dict()
        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}

        AnaField is created with parameters
        >>> AnaField('fld', lencodec=4, mincodec=3, maxcodec=4).to_dict()
        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
        >>> AnaField('fld', 4, 3, 4).to_dict()
        {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
        '''
        if isinstance(idfield, dict):
            # dict mode: a missing key gives a None attribute
            values = tuple(idfield.get(key, None) for key in
                           (IDFIELD, LENCODEC, MINCODEC, MAXCODEC, HASHF))
        elif isinstance(idfield, (AnaField, AnaDfield)):
            # copy mode
            values = (idfield.idfield, idfield.lencodec, idfield.mincodec,
                      idfield.maxcodec, idfield.hashf)
        elif not lencodec or not isinstance(lencodec, int):
            raise AnaError("lencodec is not correct")
        else:
            values = (idfield, lencodec, mincodec, maxcodec, hashf)
        (self.idfield, self.lencodec, self.mincodec,
         self.maxcodec, self.hashf) = values

    def __len__(self):
        '''length of the field : maxcodec when it is truthy, else lencodec'''
        return self.maxcodec or self.lencodec

    def __repr__(self):
        '''representation of the field (class name + idfield)'''
        return f'{self.__class__.__name__}({self.idfield!s})'

    def __eq__(self, other):
        ''' equal if the class names and the five attributes are equal'''
        if self.__class__.__name__ != other.__class__.__name__:
            return False
        return (self.idfield == other.idfield
                and self.lencodec == other.lencodec
                and self.mincodec == other.mincodec
                and self.maxcodec == other.maxcodec
                and self.hashf == other.hashf)

    def __lt__(self, other):
        ''' order two fields by comparing their hash values'''
        return hash(self) < hash(other)

    def __hash__(self):
        '''return hash value (sum of the hash of the five attributes)'''
        return sum(hash(value) for value in (
            self.idfield, self.lencodec, self.mincodec, self.maxcodec,
            self.hashf))

    def __str__(self):
        '''json-text built from the attributes dict (idfield included)'''
        attributes = self.to_dict(idfield=True)
        return json.dumps(attributes)

    def __copy__(self):
        ''' Copy all the attributes (uses the copy creation mode)'''
        return self.__class__(self)

    def to_dict(self, full=False, idfield=False, notnone=True):
        '''return a dict with field attributes.

        *Parameters*

        - **full** : boolean (default False) - if True, idfield and the
        indicators (ratecodec, dmincodec, dmaxcodec, rancodec, typecodec)
        are added
        - **idfield** : boolean (default False) - if True, idfield is included
        - **notnone** : boolean (default True) - if True, the dict is returned
        through Util.reduce_dic (None values are not included)
        '''
        result = {LENCODEC: self.lencodec, MINCODEC: self.mincodec,
                  MAXCODEC: self.maxcodec}
        if full or idfield:
            result[IDFIELD] = self.idfield
        if full:
            result.update({RATECODEC: self.ratecodec,
                           DMINCODEC: self.dmincodec,
                           DMAXCODEC: self.dmaxcodec,
                           RANCODEC: self.rancodec,
                           TYPECODEC: self.typecodec})
        return Util.reduce_dic(result) if notnone else result

    @property
    def iscomplete(self):
        '''return boolean indicator : True if maxcodec and mincodec are
        both defined (not None)'''
        return self.maxcodec is not None and self.mincodec is not None

    @property
    def ratecodec(self):
        '''return float ratecodec indicator :
        (maxcodec - lencodec) / (maxcodec - mincodec), or None when the field
        is not complete or when maxcodec equals mincodec'''
        if not self.iscomplete:
            return None
        span = self.maxcodec - self.mincodec
        if not span:
            return None
        return (self.maxcodec - self.lencodec) / span

    @property
    def dmincodec(self):
        '''return integer dmincodec indicator (lencodec - mincodec),
        None if the field is not complete'''
        if not self.iscomplete:
            return None
        return self.lencodec - self.mincodec

    @property
    def dmaxcodec(self):
        '''return integer dmaxcodec indicator (maxcodec - lencodec),
        None if the field is not complete'''
        if not self.iscomplete:
            return None
        return self.maxcodec - self.lencodec

    @property
    def rancodec(self):
        '''return integer rancodec indicator (maxcodec - mincodec),
        None if the field is not complete'''
        if not self.iscomplete:
            return None
        return self.maxcodec - self.mincodec

    @property
    def typecodec(self):
        '''return string typecodec indicator
        (null, unique, complete, full, default, mixed),
        None if maxcodec or mincodec is not defined
        '''
        lenc, minc, maxc = self.lencodec, self.mincodec, self.maxcodec
        if maxc is None or minc is None:
            return None
        # the first matching rule wins, the order is significant
        if maxc == 0:
            kind = NULL
        elif lenc == 1:
            kind = UNIQUE
        elif minc == maxc:
            kind = COMPLETE
        elif lenc == maxc:
            kind = FULL
        elif lenc == minc:
            kind = DEFAULT
        else:
            kind = MIXED
        return kind
This class analyses field entities.
Attributes
- idfield : string - name or Id of the field
- lencodec: integer - codec length
- mincodec: integer - minimal codec length
- maxcodec: integer - maximal codec length
- hashf: integer - hash value to identify modifications
characteristic (@property)
instance methods
def __init__(self, idfield, lencodec=None, mincodec=None, maxcodec=None, hashf=None):
    '''Three creation modes are available :
    - a single dict whose keys are the attribute names,
    - a single AnaField (or AnaDfield) to copy,
    - the individual attributes.

    *Parameters (multiple attributes)*

    - **idfield** : string or integer - Id of the Field
    - **lencodec** : integer (default None) - length of the codec
    - **mincodec** : integer (default None) - number of different values
    - **maxcodec** : integer (default None) - length of the field
    - **hashf** : string (default None) - update identifier

    In the multiple attributes mode an AnaError is raised when lencodec
    is falsy or is not an integer.

    *example*

    AnaField is created with a dict
    >>> AnaField({'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}).to_dict()
    {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}

    AnaField is created with parameters
    >>> AnaField('fld', lencodec=4, mincodec=3, maxcodec=4).to_dict()
    {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
    >>> AnaField('fld', 4, 3, 4).to_dict()
    {'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
    '''
    if isinstance(idfield, dict):
        # dict mode: a missing key gives a None attribute
        values = tuple(idfield.get(key, None) for key in
                       (IDFIELD, LENCODEC, MINCODEC, MAXCODEC, HASHF))
    elif isinstance(idfield, (AnaField, AnaDfield)):
        # copy mode
        values = (idfield.idfield, idfield.lencodec, idfield.mincodec,
                  idfield.maxcodec, idfield.hashf)
    elif not lencodec or not isinstance(lencodec, int):
        raise AnaError("lencodec is not correct")
    else:
        values = (idfield, lencodec, mincodec, maxcodec, hashf)
    (self.idfield, self.lencodec, self.mincodec,
     self.maxcodec, self.hashf) = values
Creation mode :
- single dict attribute where keys are attributes name,
- single AnaField attribute to make a copy
- multiple attributes
Parameters (multiple attributes)
- idfield : string or integer - Id of the Field
- lencodec : integer (default None) - length of the codec
- mincodec : integer (default None) - number of different values
- maxcodec : integer (default None) - length of the field
- hashf : string (default None) - update identifier
example
AnaField is created with a dict
>>> AnaField(Cfield([1,2,3,3]).to_analysis).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
>>> AnaField({'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
AnaField is created with parameters
>>> AnaField('fld', lencodec=4, mincodec=3, maxcodec=4).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
>>> AnaField('fld', 4, 3, 4).to_dict()
{'lencodec': 4, 'mincodec': 3, 'maxcodec': 4}
def to_dict(self, full=False, idfield=False, notnone=True):
    '''return a dict with field attributes.

    *Parameters*

    - **full** : boolean (default False) - if True, idfield and the
    indicators (ratecodec, dmincodec, dmaxcodec, rancodec, typecodec)
    are added
    - **idfield** : boolean (default False) - if True, idfield is included
    - **notnone** : boolean (default True) - if True, the dict is returned
    through Util.reduce_dic (None values are not included)
    '''
    result = {LENCODEC: self.lencodec, MINCODEC: self.mincodec,
              MAXCODEC: self.maxcodec}
    if full or idfield:
        result[IDFIELD] = self.idfield
    if full:
        result.update({RATECODEC: self.ratecodec,
                       DMINCODEC: self.dmincodec,
                       DMAXCODEC: self.dmaxcodec,
                       RANCODEC: self.rancodec,
                       TYPECODEC: self.typecodec})
    return Util.reduce_dic(result) if notnone else result
return a dict with field attributes.
Parameters
- full : boolean (default False) - if True, all the attributes are included
- idfield : boolean (default False) - if True, idfield is included
- notnone : boolean (default True) - if True, None values are not included
@property
def iscomplete(self):
    '''return boolean indicator : True if maxcodec and mincodec are
    both defined (not None)'''
    return self.maxcodec is not None and self.mincodec is not None
return boolean indicator : True if all attributes are present
@property
def ratecodec(self):
    '''return float ratecodec indicator :
    (maxcodec - lencodec) / (maxcodec - mincodec), or None when the field
    is not complete or when maxcodec equals mincodec'''
    if not self.iscomplete:
        return None
    span = self.maxcodec - self.mincodec
    if not span:
        return None
    return (self.maxcodec - self.lencodec) / span
return float ratecodec indicator
@property
def dmincodec(self):
    '''return integer dmincodec indicator (lencodec - mincodec),
    None if the field is not complete'''
    if not self.iscomplete:
        return None
    return self.lencodec - self.mincodec
return integer dmincodec indicator
@property
def dmaxcodec(self):
    '''return integer dmaxcodec indicator (maxcodec - lencodec),
    None if the field is not complete'''
    if not self.iscomplete:
        return None
    return self.maxcodec - self.lencodec
return integer dmaxcodec indicator
@property
def rancodec(self):
    '''return integer rancodec indicator (maxcodec - mincodec),
    None if the field is not complete'''
    if not self.iscomplete:
        return None
    return self.maxcodec - self.mincodec
return integer rancodec indicator
@property
def typecodec(self):
    '''return string typecodec indicator
    (null, unique, complete, full, default, mixed),
    None if maxcodec or mincodec is not defined
    '''
    lenc, minc, maxc = self.lencodec, self.mincodec, self.maxcodec
    if maxc is None or minc is None:
        return None
    # the first matching rule wins, the order is significant
    if maxc == 0:
        kind = NULL
    elif lenc == 1:
        kind = UNIQUE
    elif minc == maxc:
        kind = COMPLETE
    elif lenc == maxc:
        kind = FULL
    elif lenc == minc:
        kind = DEFAULT
    else:
        kind = MIXED
    return kind
return string typecodec indicator (null, unique, complete, full, default, mixed)
class AnaRelation:
    '''Describe the relationship between two fields.

    *Attributes* :

    - **relation** : List of the two fields involved in the relationship
    - **dist** : value of the relationship
    - **distrib** : boolean True if values are distributed
    - **hashr**: integer - hash value to identify update

    *global (@property)*

    - `id_relation`
    - `index_relation`
    - `parent_child`
    - `typecoupl`

    *characteristic (@property)*

    - `dmax`
    - `dmin`
    - `diff`
    - `dran`
    - `distomin`
    - `distomax`
    - `distance`
    - `ratecpl`
    - `rateder`

    *instance methods*

    - `to_dict`
    '''

    def __init__(self, relation, dists, hashr=None):
        '''Constructor of the relationship :

        *Parameters*

        - **relation** : List of the two fields involved in the relationship
        - **dists** : dist value, or list whose first item is the dist value
        and second item the distrib boolean (distrib is None when a single
        value is given)
        - **hashr**: integer - hash value to identify update
        '''
        self.relation = relation
        if isinstance(dists, list):
            pair = (dists[0], dists[1])
        else:
            pair = (dists, None)
        self.dist, self.distrib = pair
        self.hashr = hashr

    def __repr__(self):
        '''representation of the relation (class name + ids of the fields)'''
        return f'{self.__class__.__name__}({self.id_relation!s})'

    def __str__(self):
        '''json-text built from the attributes dict (relation included)'''
        attributes = self.to_dict(relation=True)
        return json.dumps(attributes)

    def __eq__(self, other):
        ''' equal if the class names and the four attributes are equal'''
        if self.__class__.__name__ != other.__class__.__name__:
            return False
        return (self.relation == other.relation
                and self.dist == other.dist
                and self.hashr == other.hashr
                and self.distrib == other.distrib)

    def __hash__(self):
        '''return hash value (sum of the hash of the two fields, dist,
        hashr and distrib)'''
        return sum(hash(value) for value in (
            self.relation[0], self.relation[1], self.dist, self.hashr,
            self.distrib))

    def to_dict(self, distances=False, full=False, mode='field', relation=False,
                notnone=True, misc=False):
        '''return a dict with AnaRelation attributes.

        *Parameters*

        - **distances** : boolean (default False) - if True, distance,
        distomin, distomax, distributed, ratecpl and rateder are included
        - **full** : boolean (default False) - if True, all the attributes are included
        - **relation** : boolean (default False) - if True, the relation
        (fields view) and parentchild are included
        - **notnone** : boolean (default True) - if True, the dict is returned
        through Util.reduce_dic (None values are not included)
        - **misc** : boolean (default False) - if True, dmax, dmin, diff and
        dran are included
        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
        '''
        result = {DIST: self.dist, TYPECOUPL: self.typecoupl,
                  HASHR: self.hashr}
        if full or relation:
            result[RELATION] = Util.view(self.relation, mode)
            result[PARENTCHILD] = self.parent_child
        if full or distances:
            result.update({DISTANCE: self.distance,
                           DISTOMIN: self.distomin,
                           DISTOMAX: self.distomax,
                           DISTRIBUTED: self.distrib,
                           RATECPL: self.ratecpl,
                           RATEDER: self.rateder})
        if full or misc:
            result.update({DMAX: self.dmax, DMIN: self.dmin,
                           DIFF: self.diff, DRAN: self.dran})
        return Util.reduce_dic(result) if notnone else result

    @property
    def id_relation(self):
        '''return a list with the id of the fields of the relation
        (empty list if the relation is empty)'''
        return [fld.idfield for fld in self.relation] if self.relation else []

    @property
    def parent_child(self):
        '''returns the direction of the relationship (True if parent is first).

        The first field is the parent when its lencodec is the greatest or,
        with equal lencodec, when its index is the lowest.'''
        first, second = self.relation[0], self.relation[1]
        if first.lencodec > second.lencodec:
            return True
        return first.lencodec == second.lencodec and first.index < second.index

    @property
    def index_relation(self):
        '''return a list with the index of the fields of the relation
        (empty list if the relation is empty)'''
        return [fld.index for fld in self.relation] if self.relation else []

    @property
    def dmax(self):
        '''return integer dmax indicator (product of the two lencodec)'''
        first, second = self.relation[0], self.relation[1]
        return first.lencodec * second.lencodec

    @property
    def dmin(self):
        '''return integer dmin indicator (greatest of the two lencodec)'''
        sizes = (self.relation[0].lencodec, self.relation[1].lencodec)
        return max(sizes)

    @property
    def diff(self):
        '''return integer diff indicator (absolute difference of the two
        lencodec)'''
        first, second = self.relation[0], self.relation[1]
        return abs(first.lencodec - second.lencodec)

    @property
    def dran(self):
        '''return integer dran indicator (dmax - dmin)'''
        upper = self.dmax
        return upper - self.dmin

    @property
    def distomin(self):
        '''return integer distomin indicator (dist - dmin)'''
        value = self.dist
        return value - self.dmin

    @property
    def distomax(self):
        '''return integer distomax indicator (dmax - dist)'''
        upper = self.dmax
        return upper - self.dist

    @property
    def distance(self):
        '''return integer distance indicator (distomin + diff)'''
        to_min = self.distomin
        return to_min + self.diff

    @property
    def ratecpl(self):
        '''return float ratecpl indicator :
        distance / (distance + distomax), 0 if the denominator is null'''
        numerator = self.distance
        denominator = numerator + self.distomax
        if denominator == 0:
            return 0
        return numerator / denominator

    @property
    def rateder(self):
        '''return float rateder indicator : distomin / dran, 0 if dran is null'''
        if self.dran == 0:
            return 0
        return self.distomin / self.dran

    @property
    def typecoupl(self):
        '''return relationship type (coupled, derived, crossed, linked)'''
        # the first null indicator gives the type, the order is significant
        if self.distance == 0:
            kind = COUPLED
        elif self.distomin == 0:
            kind = DERIVED
        elif self.distomax == 0:
            kind = CROSSED
        else:
            kind = LINKED
        return kind
This class analyses relationship between two fields
Attributes :
- relation : List of the two fields involved in the relationship
- dist : value of the relationship
- distrib : boolean True if values are distributed
- hashr: integer - hash value to identify update
global (@property)
characteristic (@property)
instance methods
def __init__(self, relation, dists, hashr=None):
    '''Constructor of the relationship :

    *Parameters*

    - **relation** : List of the two fields involved in the relationship
    - **dists** : dist value, or list whose first item is the dist value
    and second item the distrib boolean (distrib is None when a single
    value is given)
    - **hashr**: integer - hash value to identify update
    '''
    self.relation = relation
    if isinstance(dists, list):
        pair = (dists[0], dists[1])
    else:
        pair = (dists, None)
    self.dist, self.distrib = pair
    self.hashr = hashr
Constructor of the relationship :
Parameters
- relation : List of the two fields involved in the relationship
- dists : dist value or list of dist value and distrib boolean
- distrib : boolean True if values are distributed
- hashr: integer - hash value to identify update
def to_dict(self, distances=False, full=False, mode='field', relation=False,
            notnone=True, misc=False):
    '''return a dict with AnaRelation attributes.

    *Parameters*

    - **distances** : boolean (default False) - if True, distance,
    distomin, distomax, distributed, ratecpl and rateder are included
    - **full** : boolean (default False) - if True, all the attributes are included
    - **relation** : boolean (default False) - if True, the relation
    (fields view) and parentchild are included
    - **notnone** : boolean (default True) - if True, the dict is returned
    through Util.reduce_dic (None values are not included)
    - **misc** : boolean (default False) - if True, dmax, dmin, diff and
    dran are included
    - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
    '''
    result = {DIST: self.dist, TYPECOUPL: self.typecoupl,
              HASHR: self.hashr}
    if full or relation:
        result[RELATION] = Util.view(self.relation, mode)
        result[PARENTCHILD] = self.parent_child
    if full or distances:
        result.update({DISTANCE: self.distance,
                       DISTOMIN: self.distomin,
                       DISTOMAX: self.distomax,
                       DISTRIBUTED: self.distrib,
                       RATECPL: self.ratecpl,
                       RATEDER: self.rateder})
    if full or misc:
        result.update({DMAX: self.dmax, DMIN: self.dmin,
                       DIFF: self.diff, DRAN: self.dran})
    return Util.reduce_dic(result) if notnone else result
return a dict with AnaRelation attributes.
Parameters
- distances : boolean (default False) - if True, distances indicators are included
- full : boolean (default False) - if True, all the attributes are included
- relation : boolean (default False) - if True, idfield are included
- notnone : boolean (default True) - if True, None values are not included
- mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
@property
def id_relation(self):
    '''return a list with the id of the fields of the relation
    (empty list if the relation is empty)'''
    return [fld.idfield for fld in self.relation] if self.relation else []
return a list with the id of the two fields involved
@property
def parent_child(self):
    '''returns the direction of the relationship (True if parent is first).

    The first field is the parent when its lencodec is the greatest or,
    with equal lencodec, when its index is the lowest.'''
    first, second = self.relation[0], self.relation[1]
    if first.lencodec > second.lencodec:
        return True
    return first.lencodec == second.lencodec and first.index < second.index
returns the direction of the relationship (True if parent is first)
@property
def index_relation(self):
    '''return a list with the index of the fields of the relation
    (empty list if the relation is empty)'''
    return [fld.index for fld in self.relation] if self.relation else []
return a list with the index of the two fields involved
@property
def dmax(self):
    '''return integer dmax indicator (product of the two lencodec)'''
    first, second = self.relation[0], self.relation[1]
    return first.lencodec * second.lencodec
return integer dmax indicator
@property
def dmin(self):
    '''return integer dmin indicator (greatest of the two lencodec)'''
    sizes = (self.relation[0].lencodec, self.relation[1].lencodec)
    return max(sizes)
return integer dmin indicator
@property
def diff(self):
    '''return integer diff indicator (absolute difference of the two
    lencodec)'''
    first, second = self.relation[0], self.relation[1]
    return abs(first.lencodec - second.lencodec)
return integer diff indicator
@property
def dran(self):
    '''return integer dran indicator (dmax - dmin)'''
    upper = self.dmax
    return upper - self.dmin
return integer dran indicator
@property
def distomin(self):
    '''return integer distomin indicator (dist - dmin)'''
    value = self.dist
    return value - self.dmin
return integer distomin indicator
@property
def distomax(self):
    '''return integer distomax indicator (dmax - dist)'''
    upper = self.dmax
    return upper - self.dist
return integer distomax indicator
@property
def distance(self):
    '''return integer distance indicator (distomin + diff)'''
    to_min = self.distomin
    return to_min + self.diff
return integer distance indicator
@property
def ratecpl(self):
    '''return float ratecpl indicator :
    distance / (distance + distomax), 0 if the denominator is null'''
    numerator = self.distance
    denominator = numerator + self.distomax
    if denominator == 0:
        return 0
    return numerator / denominator
return float ratecpl indicator
@property
def rateder(self):
    '''return float rateder indicator : distomin / dran, 0 if dran is null'''
    if self.dran == 0:
        return 0
    return self.distomin / self.dran
return float rateder indicator
@property
def typecoupl(self):
    '''return relationship type (coupled, derived, crossed, linked)'''
    # the first null indicator gives the type, the order is significant
    if self.distance == 0:
        kind = COUPLED
    elif self.distomin == 0:
        kind = DERIVED
    elif self.distomax == 0:
        kind = CROSSED
    else:
        kind = LINKED
    return kind
return relationship type (coupled, derived, crossed, linked)
class AnaDfield(AnaField):
    '''This class analyses structure and relationships of fields inside a dataset

    *Attributes* :

    - **dataset** : AnaDataset object where AnaDfield is included
    - **AnaField attributes** : inheritance of AnaField object

    *relationship (@property)*

    - `list_relations`
    - `list_p_derived`
    - `list_c_derived`
    - `list_coupled`

    *field (@property)*

    - `fields`
    - `p_derived`
    - `p_distance`
    - `p_distomin`

    *global (@property)*

    - `index`
    - `dist_root`
    - `category`

    *global (instance methods)*

    - `ascendants`
    - `to_dict`
    - `view`

    *other instance methods*

    - `dic_inner_node`
    '''
    def __new__(cls, other, dataset=None):
        '''Build the instance by copying the attributes of "other".

        An AnaDfield is copied with its own `__copy__`; an AnaField is copied
        with `AnaField.__copy__` and the copy is re-classed as AnaDfield.
        Any other value gives a bare instance.
        '''
        if isinstance(other, AnaDfield):
            return AnaDfield.__copy__(other)
        if isinstance(other, AnaField):
            new = AnaField.__copy__(other)
            # the AnaField copy becomes an AnaDfield without re-running
            # the AnaField initialisation
            new.__class__ = AnaDfield
            return new
        return object.__new__(cls)

    def __init__(self, other, dataset):
        '''AnaDfield is created by adding an AnaDataset link to an AnaField object.

        *Parameters*

        - **other** : AnaField or AnaDfield to initialize attributes
          (consumed by `__new__`, unused here)
        - **dataset** : AnaDataset which includes the AnaDfield
        '''
        self.dataset = dataset

    def __copy__(self):
        ''' Copy all the data (field attributes and dataset link)'''
        return self.__class__(AnaField(self), self.dataset)

    def __lt__(self, other):
        ''' return a comparison between field index'''
        return self.index < other.index

    @property
    def index(self):
        '''return the row of the field in the AnaDataset (-1 for the root)'''
        if self == self.dataset.root:
            return -1
        return self.dataset.fields.index(self)

    @property
    def fields(self):
        '''return the list of the fields included in the AnaDataset'''
        return self.dataset.fields

    @property
    def list_relations(self):
        '''return the list of the relations with the AnaDfield'''
        return list(self.dataset.relations[self].values())

    @property
    def list_p_derived(self):
        '''return the list of the derived relations with the parents of AnaDfield'''
        return [rel for rel in self.list_relations
                if rel.typecoupl == DERIVED and not rel.parent_child]

    @property
    def list_c_derived(self):
        '''return the list of the derived relations with the children of
        AnaDfield (children whose category is unique are excluded)'''
        return [rel for rel in self.list_relations
                if rel.typecoupl == DERIVED
                and rel.parent_child
                and rel.relation[1].category != UNIQUE]

    @property
    def list_coupled(self):
        '''return the list of the coupled relations with the AnaDfield'''
        return [rel for rel in self.list_relations if rel.typecoupl == COUPLED]

    @property
    def dist_root(self):
        '''return the distance to the root field
        (dataset length minus codec length)'''
        return len(self.dataset) - self.lencodec

    @property
    def category(self):
        '''return AnaDfield category (unique, rooted, coupled, derived, mixed)'''
        if self.typecodec == UNIQUE:
            return UNIQUE
        if self.typecodec in (COMPLETE, FULL):
            return ROOTED
        # coupled : at least one coupled relation where the field is not the parent
        if any(rel.typecoupl == COUPLED for rel in self.list_relations
               if not rel.parent_child):
            return COUPLED
        # derived : no derived child, mixed otherwise
        if not self.list_c_derived:
            return DERIVED
        return MIXED

    @property
    def p_derived(self):
        '''return the first derived or coupled parent of the AnaDfield'''
        category = self.category
        if category in (UNIQUE, ROOTED):
            return self.dataset.root
        if category == COUPLED:
            # first coupled field which is not itself in the coupled category
            return [rel.relation[1] for rel in self.list_coupled
                    if rel.relation[1].category != COUPLED][0]
        parents = self.list_p_derived
        if not parents:
            return self.dataset.root
        distance_min = min(rel.distance for rel in parents)
        for rel in parents:
            if rel.distance == distance_min:
                if rel.relation[1].category == ROOTED:
                    return self.dataset.root
                if rel.relation[1].category == MIXED:
                    return rel.relation[1]
        return self.dataset.root

    @property
    def p_distance(self):
        '''return the first parent with minimal distance of the AnaDfield'''
        return self._p_min_dist()

    @property
    def p_distomin(self):
        '''return the first parent with minimal distomin of the AnaDfield'''
        return self._p_min_dist(False)

    def _p_min_dist(self, distance=True):
        '''return the parent with minimal distance of the AnaDfield

        *Parameters*

        - **distance** : boolean (default True) - if True the `distance` of the
          relations is used, else the `distomin`
        '''
        if self.category == UNIQUE:
            return self.dataset.root
        relations = self.list_relations
        if distance:
            dist_up = [rel.distance for rel in relations
                       if not rel.parent_child]
        else:
            dist_up = [rel.distomin for rel in relations
                       if not rel.parent_child]
        if not dist_up:
            return self.dataset.root
        dist_min = min(dist_up)
        if dist_min == self.dist_root:
            return self.dataset.root
        # NOTE(review): the candidates below are taken from every relation,
        # not only from the ones used to compute dist_min - confirm intent.
        if distance:
            list_dmin = [rel.relation[1] for rel in relations
                         if rel.distance == dist_min]
        else:
            list_dmin = [rel.relation[1] for rel in relations
                         if rel.distomin == dist_min]
        max_lencodec = max(fld.lencodec for fld in list_dmin)
        return [fld for fld in list_dmin if fld.lencodec == max_lencodec][0]

    def to_dict(self, mode='id'):
        '''return a dict with field attributes.

        *Parameters*

        - **mode** : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
        '''
        dic = super().to_dict(full=True, idfield=False, notnone=False)
        dic[DISTROOT] = self.dist_root
        dic[NUM] = self.index
        dic[CATEGORY] = self.category
        dic[PDISTANCE] = self.p_distance.view(mode)
        dic[PDISTOMIN] = self.p_distomin.view(mode)
        dic[PDERIVED] = self.p_derived.view(mode)
        return dic

    def view(self, mode='field'):
        ''' return a representation of the AnaDfield

        *Parameters*

        - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
        '''
        return Util.view(self, mode)

    def ascendants(self, typeparent='derived', mode='field'):
        ''' return the list of the AnaDfield's ascendants in the family tree up to
        the root AnaDfield.

        *Parameters*

        - **typeparent** : str (default 'derived') - 'derived', 'distance' or 'distomin'
          (any other value is handled as 'distomin')
        - **mode** : str (default 'field') - AnaDfield representation
        ('field', 'id', 'index')

        *Returns* : list of parents from closest to the most distant. Parents
        are represented with index, idfield, or object
        '''
        parent = self
        listparent = []
        while parent != self.dataset.root:
            if typeparent == 'derived':
                parent = parent.p_derived
            elif typeparent == 'distance':
                parent = parent.p_distance
            else:
                parent = parent.p_distomin
            # the root ends the walk and is not included in the result
            if parent != self.dataset.root:
                listparent.append(parent)
        return Util.view(listparent, mode)

    def dic_inner_node(self, mode, lname):
        '''return a child AnaDfield tree.

        *Parameters*

        - **lname** : integer - maximal length of the names
        - **mode** : string - kind of tree :
            'derived' : derived tree
            'distance': min distance tree
            'distomin': min distomin tree

        *Returns* : dict where the key is the index of the AnaDfield (string
        padded with '*') and the value is a list : the first item is the
        label "name (dist - lencodec)" (spaces and quotes replaced by '*'),
        the following items are the dicts of the children.

        *Raises* : ValueError if mode is not one of the three kinds of tree.
        '''
        if mode not in ('derived', 'distance', 'distomin'):
            # the original fell through and failed on an unbound local
            raise ValueError("mode should be 'derived', 'distance' or "
                             "'distomin', not " + repr(mode))
        if mode == 'distance':
            rel_parent = self.dataset.get_relation(self, self.p_distance)
            adding = str(rel_parent.distance) + ' - '
        elif mode == 'distomin':
            rel_parent = self.dataset.get_relation(self, self.p_distomin)
            adding = str(rel_parent.distomin) + ' - '
        else:
            rel_parent = self.dataset.get_relation(self, self.p_derived)
            adding = str(rel_parent.distance) + ' - '
        adding += str(self.lencodec)
        name = str(self.idfield)[:lname] + ' (' + adding + ')'
        lis = [name.replace(' ', '*').replace("'", '*')]
        children = []
        if mode == 'derived':
            category = self.category
            if category not in (ROOTED, COUPLED, UNIQUE):
                # coupled fields are attached directly to the current node
                for rel in self.list_coupled:
                    lis.append(rel.relation[1].dic_inner_node(mode, lname))
            if category not in (ROOTED, UNIQUE):
                children = [rel.relation[1] for rel in self.list_relations
                            if rel.relation[1].p_derived == self and
                            rel.relation[1].category != COUPLED]
        elif mode == 'distomin':
            children = [rel.relation[1] for rel in self.list_relations
                        if rel.relation[1].p_distomin == self]
        else:
            children = [rel.relation[1] for rel in self.list_relations
                        if rel.relation[1].p_distance == self]
        for fld in children:
            lis.append(fld.dic_inner_node(mode, lname))
        return {str(self.index).ljust(2, '*'): lis}
This class analyses structure and relationships of fields inside a dataset
Attributes :
- dataset : AnaDataset object where AnaDfield is included
- AnaField attributes : inheritance of AnaField object
relationship (@property)
field (@property)
global (@property)
global (instance methods)
other instance methods
def __init__(self, other, dataset):
    '''Link the field to the AnaDataset which includes it.

    *Parameters*

    - **other** : AnaField or AnaDfield to initialize attributes
      (not used in this method)
    - **dataset** : AnaDataset which includes the AnaDfield
    '''
    self.dataset = dataset
AnaDfield is created by adding an AnaDataset link to an AnaField object.
Parameters
- other : AnaField or AnaDfield to initialize attributes
- dataset : AnaDataset which includes the AnaDfield
@property
def index(self):
    '''Row of the field in the AnaDataset, -1 if the field is the root.'''
    dataset = self.dataset
    return -1 if self == dataset.root else dataset.fields.index(self)
return the row of the field in the AnaDataset
@property
def fields(self):
    '''List of the fields held by the AnaDataset of this AnaDfield.'''
    dataset = self.dataset
    return dataset.fields
return the list of the fields included in the AnaDataset
@property
def list_relations(self):
    '''List of the relations stored in the AnaDataset for this AnaDfield.'''
    relations_of_field = self.dataset.relations[self]
    return [*relations_of_field.values()]
return the list of the relations with the AnaDfield
@property
def list_p_derived(self):
    '''List of the derived relations where the AnaDfield is not the parent.'''
    selected = []
    for relation in self.list_relations:
        if relation.typecoupl == DERIVED and not relation.parent_child:
            selected.append(relation)
    return selected
return the list of the derived relations with the parents of AnaDfield
@property
def list_c_derived(self):
    '''List of the derived relations where the AnaDfield is the parent and
    the second field of the relation is not in the unique category.'''
    selected = []
    for relation in self.list_relations:
        if relation.typecoupl != DERIVED:
            continue
        if not relation.parent_child:
            continue
        if relation.relation[1].category != UNIQUE:
            selected.append(relation)
    return selected
return the list of the derived relations with the children of AnaDfield
@property
def list_coupled(self):
    '''List of the relations of the AnaDfield whose typecoupl is coupled.'''
    selected = []
    for relation in self.list_relations:
        if relation.typecoupl == COUPLED:
            selected.append(relation)
    return selected
return the list of the coupled relations with the AnaDfield
@property
def dist_root(self):
    '''Distance to the root field : dataset length minus codec length.'''
    dataset_length = len(self.dataset)
    return dataset_length - self.lencodec
return the distance to the root field
@property
def category(self):
    '''Category of the AnaDfield (unique, rooted, coupled, derived, mixed).'''
    if self.typecodec == UNIQUE:
        return UNIQUE
    if self.typecodec in (COMPLETE, FULL):
        return ROOTED
    # typecoupl of the relations where the field is not the parent
    up_typecoupl = [relation.typecoupl for relation in self.list_relations
                    if not relation.parent_child]
    if COUPLED in up_typecoupl:
        return COUPLED
    # a field with derived children is mixed, otherwise it is derived
    return MIXED if self.list_c_derived else DERIVED
return AnaDfield category (unique, rooted, coupled, derived, mixed)
@property
def p_derived(self):
    '''First derived or coupled parent of the AnaDfield (the root field
    when no other parent is selected).'''
    category = self.category
    if category in (UNIQUE, ROOTED):
        return self.dataset.root
    if category == COUPLED:
        # first coupled field which is not itself in the coupled category
        not_coupled = [relation.relation[1] for relation in self.list_coupled
                       if relation.relation[1].category != COUPLED]
        return not_coupled[0]
    parent_relations = self.list_p_derived
    if not parent_relations:
        return self.dataset.root
    shortest = min(relation.distance for relation in parent_relations)
    for relation in parent_relations:
        if relation.distance != shortest:
            continue
        candidate = relation.relation[1]
        if candidate.category == ROOTED:
            return self.dataset.root
        if candidate.category == MIXED:
            return candidate
    return self.dataset.root
return the first derived or coupled parent of the AnaDfield
@property
def p_distance(self):
    '''First parent of the AnaDfield with the minimal distance.'''
    return self._p_min_dist(True)
return the first parent with minimal distance of the AnaDfield
@property
def p_distomin(self):
    '''First parent of the AnaDfield with the minimal distomin.'''
    use_distance = False
    return self._p_min_dist(use_distance)
return the first parent with minimal distomin of the AnaDfield
def to_dict(self, mode='id'):
    '''Return the field attributes as a dict.

    The dict given by the parent class is completed with the distance to
    the root, the index, the category and the three parents.

    *Parameters*

    - **mode** : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
    '''
    result = super().to_dict(full=True, idfield=False, notnone=False)
    result[DISTROOT] = self.dist_root
    result[NUM] = self.index
    result[CATEGORY] = self.category
    # parents are inserted in the same order as the original keys
    for key, parent in ((PDISTANCE, self.p_distance),
                        (PDISTOMIN, self.p_distomin),
                        (PDERIVED, self.p_derived)):
        result[key] = parent.view(mode)
    return result
return a dict with field attributes.
Parameters
- mode : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
def view(self, mode='field'):
    '''Representation of the AnaDfield, delegated to `Util.view`.

    *Parameters*

    - **mode** : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
    '''
    representation = Util.view(self, mode)
    return representation
return a representation of the AnaDfield
Parameters
- mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
def ascendants(self, typeparent='derived', mode='field'):
    '''Return the ascendants of the AnaDfield in the family tree, up to
    (and excluding) the root AnaDfield.

    *Parameters*

    - **typeparent** : str (default 'derived') - 'derived', 'distance' or 'distomin'
      (any other value is handled as 'distomin')
    - **mode** : str (default 'field') - AnaDfield representation
    ('field', 'id', 'index')

    *Returns* : list of parents from closest to the most distant. Parents
    are represented with index, idfield, or object
    '''
    lineage = []
    current = self
    while current != self.dataset.root:
        if typeparent == 'derived':
            current = current.p_derived
        elif typeparent == 'distance':
            current = current.p_distance
        else:
            current = current.p_distomin
        if current == self.dataset.root:
            # the root is not part of the result
            continue
        lineage.append(current)
    return Util.view(lineage, mode)
return the list of the AnaDfield's ascendants in the family tree up to the root AnaDfield.
Parameters
- typeparent : str (default 'derived') - 'derived', 'distance' or 'distomin'
- mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
Returns : list of parents from closest to the most distant. Parents are represented with index, idfield, or object
def dic_inner_node(self, mode, lname):
    '''return a child AnaDfield tree.

    *Parameters*

    - **lname** : integer - maximal length of the names
    - **mode** : string - kind of tree :
        'derived' : derived tree
        'distance': min distance tree
        'distomin': min distomin tree

    *Returns* : dict where the key is the index of the AnaDfield (string
    padded with '*') and the value is a list : the first item is the
    label "name (dist - lencodec)" (spaces and quotes replaced by '*'),
    the following items are the dicts of the children.

    *Raises* : ValueError if mode is not one of the three kinds of tree.
    '''
    if mode not in ('derived', 'distance', 'distomin'):
        # the original fell through and failed on an unbound local
        raise ValueError("mode should be 'derived', 'distance' or "
                         "'distomin', not " + repr(mode))
    if mode == 'distance':
        rel_parent = self.dataset.get_relation(self, self.p_distance)
        adding = str(rel_parent.distance) + ' - '
    elif mode == 'distomin':
        rel_parent = self.dataset.get_relation(self, self.p_distomin)
        adding = str(rel_parent.distomin) + ' - '
    else:
        rel_parent = self.dataset.get_relation(self, self.p_derived)
        adding = str(rel_parent.distance) + ' - '
    adding += str(self.lencodec)
    name = str(self.idfield)[:lname] + ' (' + adding + ')'
    lis = [name.replace(' ', '*').replace("'", '*')]
    children = []
    if mode == 'derived':
        category = self.category
        if category not in (ROOTED, COUPLED, UNIQUE):
            # coupled fields are attached directly to the current node
            for rel in self.list_coupled:
                lis.append(rel.relation[1].dic_inner_node(mode, lname))
        if category not in (ROOTED, UNIQUE):
            children = [rel.relation[1] for rel in self.list_relations
                        if rel.relation[1].p_derived == self and
                        rel.relation[1].category != COUPLED]
    elif mode == 'distomin':
        children = [rel.relation[1] for rel in self.list_relations
                    if rel.relation[1].p_distomin == self]
    else:
        children = [rel.relation[1] for rel in self.list_relations
                    if rel.relation[1].p_distance == self]
    for fld in children:
        lis.append(fld.dic_inner_node(mode, lname))
    return {str(self.index).ljust(2, '*'): lis}
return a child AnaDfield tree.
Parameters
- lname : integer - maximal length of the names
- mode : string (default 'derived') - kind of tree : 'derived' : derived tree 'distance': min distance tree 'distomin': min distomin tree
Returns : dict where the key is an AnaDfield and the value is the list of its children "name (dist - lencodec)".
704class AnaDataset: 705 '''This class analyses the structure of a dataset. 706 707 *Attributes* : 708 709 - **iddataset** : string or integer - Id of the Dataset 710 - **fields** : list of the AnaDfields included 711 - **relations** : dict of the AnaRelations between two AnaDfields 712 - **hashd** : string - update identifier 713 714 *relationship (@property)* 715 716 - `ana_relations` 717 - `p_relations` 718 719 *field (@property)* 720 721 - `root` 722 - `primary` 723 - `secondary` 724 - `unique` 725 - `mixte` 726 - `variable` 727 728 *global (@property)* 729 730 - `category` 731 - `complete` 732 - `dimension` 733 734 *update (instance methods)* 735 736 - `set_relations` 737 738 *access (instance methods)* 739 740 - `get_relation` 741 - `dfield` 742 743 *synthesis (instance methods)* 744 745 - `tree` 746 - `to_dict` 747 - `indicator` 748 - `partitions` 749 - `field_partition` 750 - `relation_partition` 751 ''' 752 753 def __init__(self, fields=None, relations=None, iddataset=None, 754 leng=None, hashd=None): 755 '''Creation mode : 756 - single dict attribute where keys are attributes name, 757 - single AnaDataset attribute to make a copy 758 - multiple attributes 759 760 *Parameters (single dict)* 761 762 - **fields**: {'fields': list_of_dict, 'name': id_dataset, 763 'length': length, 'relations': dict_of_relations 764 where: 765 list_of_dict : {'id': id_field, 'lencodec': len_codec, 'mincodec': min_codec} 766 id_field: string - name of field 767 other_field: string - name of field 768 len_codec: int - length of the codec 769 min_codec: int - number of different codec values 770 id_dataset : name of the dataset 771 length: int - length of the dataset 772 dict_of_relations: {id_field : {other_field: dist} for all fields} 773 field: name of a field 774 field_other: name of another field 775 dist: integer (distance between the two fields) or 776 array (distance and boolean distributed) 777 778 *Parameters (multiple attributes)* 779 780 - **fields**: list_of_dict 781 
- **iddataset** : string (default None) - id_dataset 782 - **relations** : dict (default None) - dict_of_relations 783 - **leng** : int (default None) - length 784 - **hashd** : string (default None) - update identifier 785 ''' 786 if isinstance(fields, AnaDataset): 787 self.iddataset = fields.iddataset 788 self.fields = fields.fields 789 self.relations = fields.relations 790 self.hashd = fields.hashd 791 return 792 if isinstance(fields, dict): 793 iddataset = fields.get(IDDATASET, None) 794 leng = fields.get(LENGTH, None) 795 relations = fields.get(RELATIONS, None) 796 hashd = fields.get(HASHD) 797 fields = fields.get(FIELDS, None) 798 self.iddataset = iddataset 799 self.fields = [AnaDfield(AnaField(field), self) 800 for field in fields] if fields else [] 801 if leng: 802 for fld in self.fields: 803 fld.maxcodec = leng 804 self.relations = {field: {} for field in self.fields} 805 if relations: 806 for fld, dic_relation in relations.items(): 807 self.set_relations(fld, dic_relation) 808 self.hashd = hashd 809 810 def __len__(self): 811 '''length of the AnaDataset (len of the AnaDfields included)''' 812 return max(len(fld) for fld in self.fields) 813 814 def __eq__(self, other): 815 ''' equal if class and values are equal''' 816 return self.__class__ .__name__ == other.__class__.__name__ and \ 817 self.fields == other.fields and self.relations == other.relations and \ 818 self.iddataset == other.iddataset and self.hashd == other.hashd 819 820 def __hash__(self): 821 '''return hash value (sum of attributes hash)''' 822 return hash(self.iddataset) + sum(hash(fld) for fld in self.fields) + \ 823 sum(hash(rel) for rel in self.relations) + hash(self.hashd) 824 825 @property 826 def category(self): 827 '''return a list of AnaDfield category (unique, rooted, coupled, derived, mixed)''' 828 return [fld.category for fld in self.fields] 829 830 @property 831 def ana_relations(self): 832 '''return the list of AnaRelation included''' 833 return [rel for fldrel in 
self.relations.values() for rel in fldrel.values()] 834 835 @property 836 def p_relations(self): 837 '''return the list of oriented AnaRelation (parent first, child second)''' 838 return [rel for rel in self.ana_relations if rel.parent_child] 839 840 @property 841 def root(self): 842 '''return the root AnaDfield''' 843 len_self = len(self) 844 return AnaDfield(AnaField(ROOT, len_self, len_self, len_self), self) 845 846 @property 847 def primary(self): 848 '''return the first partition of the partitions''' 849 return self.field_partition(mode='field')['primary'] 850 # part = self.partitions(mode='field', distributed=True) 851 # return part[0] if part else [] 852 853 @property 854 def complete(self): 855 '''return True if the dimension is not 0''' 856 return self.dimension > 0 857 858 @property 859 def dimension(self): 860 '''return the highest partition lenght''' 861 return len(self.primary) 862 863 @property 864 def secondary(self): 865 '''return the derived ou coupled fields from primary''' 866 return self.field_partition(mode='field')['secondary'] 867 868 @property 869 def unique(self): 870 '''return the unique fields''' 871 return [fld for fld in self.fields if fld.category == UNIQUE] 872 873 @property 874 def variable(self): 875 '''return the variable fields''' 876 return self.field_partition(mode='field')['variable'] 877 878 @property 879 def mixte(self): 880 '''return the variable fields''' 881 return self.field_partition(mode='field')['mixte'] 882 883 def set_relations(self, field, dic_relations): 884 '''Add relations in the AnaDataset from a dict. 
885 886 *Parameters* 887 888 - **field** : AnaDfield, AnaField or str (idfield) - first relation AnaDfield 889 - **dic_relations** : dict - key is the second relation AnaDfield and 890 value is the dist value or teh list [dist, distrib] 891 ''' 892 fld = self.dfield(field) 893 for other, dist in dic_relations.items(): 894 oth = self.dfield(other) 895 self.relations[fld][oth] = AnaRelation([fld, oth], dist) 896 self.relations[oth][fld] = AnaRelation([oth, fld], dist) 897 898 def get_relation(self, fld1, fld2): 899 '''Return AnaRelation between fld1 and fld2. 900 901 *Parameters* 902 903 - **fld1** : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield 904 - **fld2** : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield 905 ''' 906 fl1 = self.dfield(fld1) 907 fl2 = self.dfield(fld2) 908 if self.root in [fl1, fl2]: 909 return AnaRelation([fl1, fl2], len(self)) 910 return self.relations[self.dfield(fld1)][self.dfield(fld2)] 911 912 def dfield(self, fld): 913 '''return the AnaDfield matching with fld. Fld is str, int, AnaDfield or AnaField''' 914 if fld in (-1, ROOT): 915 return self.root 916 if isinstance(fld, AnaDfield): 917 return fld 918 if isinstance(fld, int): 919 return self.fields[fld] 920 if isinstance(fld, str): 921 if fld in [dfld.idfield for dfld in self.fields]: 922 return [dfld for dfld in self.fields if dfld.idfield == fld][0] 923 return None 924 return AnaDfield(fld, self) 925 926 def tree(self, mode='derived', width=5, lname=20, string=True): 927 '''return a string with a tree of derived Field. 
928 929 *Parameters* 930 931 - **lname** : integer (default 20) - length of the names 932 - **width** : integer (default 5) - length of the lines 933 - **string** : boolean (default True) - if True return str else return dict 934 - **mode** : string (default 'derived') - kind of tree : 935 'derived' : derived tree 936 'distance': min distance tree 937 'distomin': min distomin tree 938 ''' 939 lis = ['root-' + mode + '*(' + str(len(self)) + ')'] 940 if mode == 'distance': 941 childs = [fld for fld in self.fields if fld.p_distance == self.root] 942 elif mode == 'distomin': 943 childs = [fld for fld in self.fields if fld.p_distomin == self.root] 944 elif mode == 'derived': 945 childs = [fld for fld in self.fields if fld.p_derived == self.root] 946 for fld in childs: 947 lis.append(fld.dic_inner_node(mode, lname)) 948 tree = {str(-1).ljust(2, '*'): lis} 949 if string: 950 tre = pprint.pformat(tree, indent=0, width=width) 951 tre = tre.replace('---', ' - ') 952 tre = tre.replace(' ', ' ') 953 tre = tre.replace('*', ' ') 954 for car in ["'", "\"", "{", "[", "]", "}", ","]: 955 tre = tre.replace(car, "") 956 return tre 957 return Util.clean_dic(tree, '*', ' ') 958 959 def to_dict(self, mode='field', keys=None, relations=False): 960 '''return a dict with fields attributes and optionaly relations attributes. 
961 962 *Parameters* 963 964 - **mode** : str (default 'field') - AnaDfield representation 965 ('field', 'id', 'index') 966 - **relations** : boolean (default: False) - if False return a list of fields, 967 if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}' 968 - **keys** : string, list or tuple - list of keys or single key to return 969 if 'all' or None, all keys are returned 970 if list, only keys in list are returned 971 if string, only values associated to the string(key) are returned''' 972 fields = Util.filter_dic([fld.to_dict(mode=mode) 973 for fld in self.fields], keys) 974 leng = len(self.fields) 975 if not relations: 976 return fields 977 return {'fields': fields, 'relations': 978 [self.get_relation(i, j).to_dict(full=True, mode=mode) 979 for i in range(-1, leng) for j in range(i + 1, leng)]} 980 981 def partitions(self, mode='id', distributed=True): 982 '''return a list of available partitions (the first is highest). 983 984 *Parameters* 985 986 - **mode** : str (default 'id') - AnaDfield representation 987 ('field', 'id', 'index') 988 - **distributed** : boolean (default True) - Include only distributed fields 989 ''' 990 partit = [[fld] for fld in self.fields if fld.category == ROOTED] 991 crossed = [rel for rel in self.ana_relations if rel.typecoupl == CROSSED 992 and rel.parent_child 993 and rel.relation[0].category != COUPLED 994 and rel.relation[1].category != COUPLED] 995 if distributed: 996 crossed = [rel for rel in crossed if rel.distrib] 997 if crossed and len(crossed) == 1 and crossed[0].dist == len(self): 998 partit.insert(0, crossed[0].relation) 999 elif crossed: 1000 for repeat in list(range(len(crossed))): 1001 candidates = combinations(crossed, repeat + 1) 1002 for candidat in candidates: 1003 flds = list(set(rel.relation[i] 1004 for rel in candidat for i in [0, 1])) 1005 if (reduce(mul, [fld.lencodec for fld in flds]) == len(self) and 1006 len(candidat) == sum(range(len(flds))) and 1007 (not 
distributed or min(rel.distrib for rel in candidat))): 1008 partit.insert(0, flds) 1009 partit = [list(tup) for tup in 1010 sorted(sorted(list({tuple(sorted(prt)) for prt in partit})), 1011 key=len, reverse=True)] 1012 return Util.view(partit, mode) 1013 1014 def field_partition(self, mode='id', partition=None, distributed=True): 1015 '''return a partition dict with the list of primary, secondary, unique 1016 and variable fields. 1017 1018 *Parameters* 1019 1020 - **mode** : str (default 'id') - AnaDfield representation 1021 ('field', 'id', 'index') 1022 - **partition** : list of str, int, AnaDfield or AnaField(default None) - 1023 if None, partition is the first 1024 - **distributed** : boolean (default True) - Include only distributed fields 1025 ''' 1026 partitions = self.partitions(mode='field', distributed=distributed) 1027 if not partitions: 1028 return Util.view( 1029 {'primary': [], 'secondary': [ 1030 fld for fld in self.fields if fld.category != UNIQUE], 1031 'mixte': [], 'unique': [ 1032 fld for fld in self.fields if fld.category == UNIQUE], 1033 'variable': []}, mode) 1034 if not partition: 1035 partition = partitions[0] 1036 else: 1037 # partition = [self.dfield(fld) for fld in tuple(sorted(partition))] 1038 partition = [self.dfield(fld) for fld in tuple(partition)] 1039 secondary = [] 1040 for field in partition: 1041 self._add_child(field, secondary) 1042 secondary = [fld for fld in secondary if not fld in partition] 1043 unique = [fld for fld in self.fields if fld.category == UNIQUE] 1044 mixte = list(self._mixte_dims(partition, partitions)) 1045 variable = [fld for fld in self.fields 1046 if not fld in partition + secondary + unique + mixte] 1047 return Util.view({'primary': partition, 'secondary': secondary, 1048 'mixte': mixte, 'unique': unique, 1049 'variable': variable}, mode) 1050 1051 def relation_partition(self, partition=None, primary=False, noroot=False): 1052 '''return a dict with the list of relationships for fields in a partition. 
1053 1054 *Parameters* 1055 1056 - **partition** : list (default None) - if None, partition is the first 1057 - **primary** : boolean (default False) - if True, relations are primary fields 1058 - **noroot** : boolean (default False) - if True and single primary, 1059 'root' field is replaced by the primary field''' 1060 partitions = self.partitions(mode='field') 1061 if not partitions: 1062 partition = None 1063 else: 1064 partition = Util.view(partition, mode='field', 1065 ana=self) if partition else partitions[0] 1066 part = self.field_partition( 1067 mode='field', partition=partition, distributed=True) 1068 fields_cat = {fld: cat for cat, l_fld in part.items() for fld in l_fld} 1069 relations = {} 1070 for field in fields_cat: 1071 rel = [] 1072 match fields_cat[field]: 1073 case 'primary': 1074 rel = [field.idfield] 1075 case 'unique': ... 1076 case 'variable': 1077 rel = [fld.idfield for fld in part['primary']] 1078 case 'secondary' if not primary: 1079 rel = [field.p_derived.idfield] 1080 case 'secondary' if primary: 1081 rel = [fld.idfield for fld in field.ascendants() 1082 if fld in part['primary']] 1083 case 'mixte': 1084 rel = [fld.idfield for fld in self._mixte_dims( 1085 partition, partitions)[field]] 1086 case _: ... 
1087 if rel == ['root'] and len(part['primary']) == 1 and noroot: 1088 rel = [part['primary'][0].idfield] 1089 if rel == ['root'] and len(part['primary']) == 0 and noroot: 1090 rel = [part['secondary'][0].idfield] 1091 relations[field.idfield] = rel 1092 return relations 1093 1094 def indicator(self, fullsize, size): 1095 '''generate size indicators: ol (object lightness), ul (unicity level), 1096 gain (sizegain) 1097 1098 *Parameters* 1099 1100 - **fullsize** : int - size with full codec 1101 - **size** : int - size with existing codec 1102 1103 *Returns* : dict''' 1104 lenindex = len(self.fields) 1105 indexlen = sum(fld.lencodec for fld in self.fields) 1106 nval = len(self) * (lenindex + 1) 1107 sval = fullsize / nval 1108 ncod = indexlen + lenindex 1109 1110 if nval != ncod: 1111 scod = (size - ncod * sval) / (nval - ncod) 1112 olight = scod / sval 1113 else: 1114 olight = None 1115 return {'total values': nval, 'mean size': round(sval, 3), 1116 'unique values': ncod, 'mean coding size': round(scod, 3), 1117 'unicity level': round(ncod / nval, 3), 1118 'optimize level': round(size / fullsize, 3), 1119 'object lightness': round(olight, 3), 1120 'maxgain': round((nval - ncod) / nval, 3), 1121 'gain': round((fullsize - size) / fullsize, 3)} 1122 1123 def _add_child(self, field, childs): 1124 ''' add derived or coupled fields in the childs list''' 1125 for rel in field.list_c_derived + field.list_coupled: 1126 child = rel.relation[1] 1127 if not child in childs and not child.category == UNIQUE: 1128 childs.append(child) 1129 if not child.category in (COUPLED, UNIQUE): 1130 self._add_child(child, childs) 1131 1132 def _mixte_dims(self, partition, partitions): 1133 '''return dict with dimensions associated to each mixte field''' 1134 dic_mixte = {} 1135 for part in partitions: 1136 not_part = [fld for fld in part if not fld in partition] 1137 if len(not_part) == 1 and len(partition) > len(part) > 1: 1138 sub_part = [fld for fld in partition if not fld in part] 1139 if 
min(self.get_relation(not_part[0], fld).typecoupl == 'derived' 1140 for fld in sub_part) is True: 1141 dic_mixte[not_part[0]] = sub_part 1142 return dic_mixte
This class analyses the structure of a dataset.
Attributes :
- iddataset : string or integer - Id of the Dataset
- fields : list of the AnaDfields included
- relations : dict of the AnaRelations between two AnaDfields
- hashd : string - update identifier
relationship (@property)
field (@property)
global (@property)
update (instance methods)
access (instance methods)
synthesis (instance methods)
def __init__(self, fields=None, relations=None, iddataset=None,
             leng=None, hashd=None):
    '''Create an AnaDataset. Three creation modes are available :
    - a single dict where keys are the attribute names,
    - a single AnaDataset (the attributes of this object are reused),
    - multiple attributes

    *Parameters (single dict)*

    - **fields**: {'fields': list_of_dict, 'name': id_dataset,
                   'length': length, 'relations': dict_of_relations,
                   'hashd': update identifier}
        where:
            list_of_dict : list of field descriptions, e.g.
                {'id': id_field, 'lencodec': len_codec, 'mincodec': min_codec}
            id_dataset : name of the dataset
            length: int - length of the dataset
            dict_of_relations: {id_field : {other_field: dist} for all fields}
                dist: integer (distance between the two fields) or
                array (distance and boolean distributed)

    *Parameters (multiple attributes)*

    - **fields**: list_of_dict
    - **iddataset** : string (default None) - id_dataset
    - **relations** : dict (default None) - dict_of_relations
    - **leng** : int (default None) - length
    - **hashd** : string (default None) - update identifier
    '''
    if isinstance(fields, AnaDataset):
        # copy mode: the attributes of the source object are shared, not cloned
        source = fields
        self.iddataset = source.iddataset
        self.fields = source.fields
        self.relations = source.relations
        self.hashd = source.hashd
        return
    if isinstance(fields, dict):
        # dict mode: every attribute is read from the dict
        description = fields
        iddataset = description.get(IDDATASET, None)
        leng = description.get(LENGTH, None)
        relations = description.get(RELATIONS, None)
        hashd = description.get(HASHD)
        fields = description.get(FIELDS, None)
    self.iddataset = iddataset
    if fields:
        self.fields = [AnaDfield(AnaField(item), self) for item in fields]
    else:
        self.fields = []
    if leng:
        # the dataset length is stored as maxcodec of each field
        for dfld in self.fields:
            dfld.maxcodec = leng
    self.relations = {dfld: {} for dfld in self.fields}
    if relations:
        for idfld, dic_relation in relations.items():
            self.set_relations(idfld, dic_relation)
    self.hashd = hashd
Creation mode :
- single dict attribute where keys are attributes name,
- single AnaDataset attribute to make a copy
- multiple attributes
Parameters (single dict)
- fields: {'fields': list_of_dict, 'name': id_dataset, 'length': length, 'relations': dict_of_relations} where: list_of_dict : {'id': id_field, 'lencodec': len_codec, 'mincodec': min_codec} id_field: string - name of field other_field: string - name of field len_codec: int - length of the codec min_codec: int - number of different codec values id_dataset : name of the dataset length: int - length of the dataset dict_of_relations: {id_field : {other_field: dist} for all fields} field: name of a field field_other: name of another field dist: integer (distance between the two fields) or array (distance and boolean distributed)
Parameters (multiple attributes)
- fields: list_of_dict
- iddataset : string (default None) - id_dataset
- relations : dict (default None) - dict_of_relations
- leng : int (default None) - length
- hashd : string (default None) - update identifier
@property
def category(self):
    '''return the list of the categories of the AnaDfield, in the order of
    the fields (unique, rooted, coupled, derived, mixed)'''
    categories = []
    for dfld in self.fields:
        categories.append(dfld.category)
    return categories
return a list of AnaDfield category (unique, rooted, coupled, derived, mixed)
@property
def ana_relations(self):
    '''return the flat list of all the AnaRelation stored in the dataset'''
    found = []
    # relations is a two-level dict: first field -> second field -> AnaRelation
    for by_other in self.relations.values():
        found.extend(by_other.values())
    return found
return the list of AnaRelation included
@property
def p_relations(self):
    '''return the list of the AnaRelation whose parent_child attribute is
    true (oriented relations: parent first, child second)'''
    oriented = []
    for rel in self.ana_relations:
        if rel.parent_child:
            oriented.append(rel)
    return oriented
return the list of oriented AnaRelation (parent first, child second)
@property
def root(self):
    '''return the root AnaDfield (a new object is built at each access)'''
    size = len(self)
    # the three numeric arguments of the root AnaField are the dataset length
    root_field = AnaField(ROOT, size, size, size)
    return AnaDfield(root_field, self)
return the root AnaDfield
@property
def primary(self):
    '''return the primary fields of the default field partition'''
    field_part = self.field_partition(mode='field')
    return field_part['primary']
return the first partition of the partitions
@property
def complete(self):
    '''return True if the dimension is strictly positive'''
    dim = self.dimension
    return dim > 0
return True if the dimension is not 0
@property
def dimension(self):
    '''return the number of primary fields (length of the first partition)'''
    primary_fields = self.primary
    return len(primary_fields)
return the highest partition length
@property
def secondary(self):
    '''return the secondary fields of the default field partition
    (fields derived or coupled from primary fields)'''
    field_part = self.field_partition(mode='field')
    return field_part['secondary']
return the derived or coupled fields from primary
@property
def unique(self):
    '''return the list of the fields whose category is unique'''
    selected = []
    for dfld in self.fields:
        if dfld.category == UNIQUE:
            selected.append(dfld)
    return selected
return the unique fields
@property
def variable(self):
    '''return the variable fields of the default field partition'''
    field_part = self.field_partition(mode='field')
    return field_part['variable']
return the variable fields
@property
def mixte(self):
    '''return the 'mixte' fields of the default field partition'''
    field_part = self.field_partition(mode='field')
    return field_part['mixte']
return the mixte fields
def set_relations(self, field, dic_relations):
    '''Store in the AnaDataset the relations described in a dict.

    Each relation is registered in both directions (field -> other and
    other -> field) with the same dist value.

    *Parameters*

    - **field** : AnaDfield, AnaField or str (idfield) - first relation AnaDfield
    - **dic_relations** : dict - key is the second relation AnaDfield and
    value is the dist value or the list [dist, distrib]
    '''
    first = self.dfield(field)
    for other, dist in dic_relations.items():
        second = self.dfield(other)
        for src, dst in ((first, second), (second, first)):
            self.relations[src][dst] = AnaRelation([src, dst], dist)
Add relations in the AnaDataset from a dict.
Parameters
- field : AnaDfield, AnaField or str (idfield) - first relation AnaDfield
- dic_relations : dict - key is the second relation AnaDfield and value is the dist value or the list [dist, distrib]
def get_relation(self, fld1, fld2):
    '''Return AnaRelation between fld1 and fld2.

    If one of the two fields is the root field, a new AnaRelation is built
    with the dataset length as dist value. Otherwise the stored relation
    is returned (KeyError if the relation is not stored).

    *Parameters*

    - **fld1** : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield
    - **fld2** : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield
    '''
    fl1 = self.dfield(fld1)
    fl2 = self.dfield(fld2)
    if self.root in [fl1, fl2]:
        return AnaRelation([fl1, fl2], len(self))
    # the fields are already resolved: no second lookup with dfield
    return self.relations[fl1][fl2]
Return AnaRelation between fld1 and fld2.
Parameters
- fld1 : AnaDfield, AnaField, int or str (idfield) - first relation AnaDfield
- fld2 : AnaDfield, AnaField, int or str (idfield) - second relation AnaDfield
def dfield(self, fld):
    '''return the AnaDfield matching with fld (str, int, AnaDfield or AnaField).

    - -1 or the root name: the root AnaDfield,
    - AnaDfield: the object itself,
    - int: the field at this position in the list of fields,
    - str: the first field with this idfield (None if there is none),
    - other value (AnaField): a new AnaDfield built from it.
    '''
    if fld in (-1, ROOT):
        return self.root
    if isinstance(fld, AnaDfield):
        return fld
    if isinstance(fld, int):
        return self.fields[fld]
    if not isinstance(fld, str):
        return AnaDfield(fld, self)
    for candidate in self.fields:
        if candidate.idfield == fld:
            return candidate
    return None
return the AnaDfield matching with fld. Fld is str, int, AnaDfield or AnaField
def tree(self, mode='derived', width=5, lname=20, string=True):
    '''return a tree of the fields, as a string or as a dict.

    The first level of the tree is made of the fields whose parent
    (p_derived, p_distance or p_distomin depending on mode) is the root field.

    *Parameters*

    - **lname** : integer (default 20) - length of the names
    - **width** : integer (default 5) - length of the lines
    - **string** : boolean (default True) - if True return str else return dict
    - **mode** : string (default 'derived') - kind of tree :
        'derived' : derived tree
        'distance': min distance tree
        'distomin': min distomin tree

    *Returns* : str if string is True, else dict
    '''
    # first node: root label with the dataset length ('*' is a placeholder
    # replaced by a space at the end)
    lis = ['root-' + mode + '*(' + str(len(self)) + ')']
    # NOTE(review): for any other mode value, 'childs' is never assigned and
    # the loop below fails - confirm whether an explicit error is expected
    if mode == 'distance':
        childs = [fld for fld in self.fields if fld.p_distance == self.root]
    elif mode == 'distomin':
        childs = [fld for fld in self.fields if fld.p_distomin == self.root]
    elif mode == 'derived':
        childs = [fld for fld in self.fields if fld.p_derived == self.root]
    for fld in childs:
        lis.append(fld.dic_inner_node(mode, lname))
    tree = {str(-1).ljust(2, '*'): lis}
    if string:
        # the dict is pretty-printed then the punctuation is removed
        # NOTE(review): the whitespace inside the string literals below is
        # reproduced as displayed; as written the second replace is a no-op
        # (it possibly collapsed repeated spaces) - confirm against the repository
        tre = pprint.pformat(tree, indent=0, width=width)
        tre = tre.replace('---', ' - ')
        tre = tre.replace(' ', ' ')
        tre = tre.replace('*', ' ')
        for car in ["'", "\"", "{", "[", "]", "}", ","]:
            tre = tre.replace(car, "")
        return tre
    # dict output: only the '*' placeholders are replaced
    return Util.clean_dic(tree, '*', ' ')
return a string with a tree of derived Field.
Parameters
- lname : integer (default 20) - length of the names
- width : integer (default 5) - length of the lines
- string : boolean (default True) - if True return str else return dict
- mode : string (default 'derived') - kind of tree : 'derived' : derived tree 'distance': min distance tree 'distomin': min distomin tree
def to_dict(self, mode='field', keys=None, relations=False):
    '''return the fields attributes and, optionally, the relations attributes.

    *Parameters*

    - **mode** : str (default 'field') - AnaDfield representation
    ('field', 'id', 'index')
    - **relations** : boolean (default: False) - if False return a list of fields,
    if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}'
    - **keys** : string, list or tuple - list of keys or single key to return
    if 'all' or None, all keys are returned
    if list, only keys in list are returned
    if string, only values associated to the string(key) are returned'''
    described = [fld.to_dict(mode=mode) for fld in self.fields]
    fields = Util.filter_dic(described, keys)
    if not relations:
        return fields
    nb_fields = len(self.fields)
    rels = []
    # every pair (first < second) is included, -1 being the root field
    for first in range(-1, nb_fields):
        for second in range(first + 1, nb_fields):
            relation = self.get_relation(first, second)
            rels.append(relation.to_dict(full=True, mode=mode))
    return {'fields': fields, 'relations': rels}
return a dict with fields attributes and optionally relations attributes.
Parameters
- mode : str (default 'field') - AnaDfield representation ('field', 'id', 'index')
- relations : boolean (default: False) - if False return a list of fields,
if True return a dict '{"fields": <list of fields>, "relations": <list of relations>}'
- keys : string, list or tuple - list of keys or single key to return if 'all' or None, all keys are returned if list, only keys in list are returned if string, only values associated to the string(key) are returned
def partitions(self, mode='id', distributed=True):
    '''return a list of available partitions (the first is highest).

    A partition is a list of fields. The result contains each rooted field
    (as a single-field partition) and the groups of crossed fields whose
    product of lencodec equals the dataset length.

    *Parameters*

    - **mode** : str (default 'id') - AnaDfield representation
    ('field', 'id', 'index')
    - **distributed** : boolean (default True) - Include only distributed fields

    *Returns* : list of partitions (without duplicates, longest first)
    '''
    # each rooted field is a partition by itself
    partit = [[fld] for fld in self.fields if fld.category == ROOTED]
    # oriented crossed relations where neither field is coupled
    crossed = [rel for rel in self.ana_relations if rel.typecoupl == CROSSED
               and rel.parent_child
               and rel.relation[0].category != COUPLED
               and rel.relation[1].category != COUPLED]
    if distributed:
        crossed = [rel for rel in crossed if rel.distrib]
    if crossed and len(crossed) == 1 and crossed[0].dist == len(self):
        # single crossed relation: its two fields are a partition when the
        # dist value equals the dataset length
        partit.insert(0, crossed[0].relation)
    elif crossed:
        # every combination of 1..len(crossed) relations is tested
        for repeat in list(range(len(crossed))):
            candidates = combinations(crossed, repeat + 1)
            for candidat in candidates:
                # distinct fields used by the relations of the combination
                flds = list(set(rel.relation[i]
                                for rel in candidat for i in [0, 1]))
                # accepted if the product of lencodec equals the dataset
                # length, if the number of relations equals the number of
                # pairs of fields (n * (n - 1) / 2) and, when distributed
                # is required, if every relation has a true distrib
                if (reduce(mul, [fld.lencodec for fld in flds]) == len(self) and
                    len(candidat) == sum(range(len(flds))) and
                        (not distributed or min(rel.distrib for rel in candidat))):
                    partit.insert(0, flds)
    # duplicates are removed (fields sorted inside each partition), then
    # partitions are sorted and ordered by decreasing length (stable sort)
    partit = [list(tup) for tup in
              sorted(sorted(list({tuple(sorted(prt)) for prt in partit})),
                     key=len, reverse=True)]
    return Util.view(partit, mode)
return a list of available partitions (the first is highest).
Parameters
- mode : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
- distributed : boolean (default True) - Include only distributed fields
def field_partition(self, mode='id', partition=None, distributed=True):
    '''return a dict with the lists of primary, secondary, mixte, unique
    and variable fields for a partition.

    If no partition is available, the non-unique fields are all secondary.

    *Parameters*

    - **mode** : str (default 'id') - AnaDfield representation
    ('field', 'id', 'index')
    - **partition** : list of str, int, AnaDfield or AnaField(default None) -
    if None, partition is the first
    - **distributed** : boolean (default True) - Include only distributed fields
    '''
    all_parts = self.partitions(mode='field', distributed=distributed)
    if not all_parts:
        not_unique = [fld for fld in self.fields if fld.category != UNIQUE]
        only_unique = [fld for fld in self.fields if fld.category == UNIQUE]
        return Util.view({'primary': [], 'secondary': not_unique,
                          'mixte': [], 'unique': only_unique,
                          'variable': []}, mode)
    if partition:
        primary = [self.dfield(fld) for fld in tuple(partition)]
    else:
        primary = all_parts[0]
    # the children of the primary fields are collected by _add_child
    descendants = []
    for fld in primary:
        self._add_child(fld, descendants)
    secondary = [fld for fld in descendants if fld not in primary]
    unique = [fld for fld in self.fields if fld.category == UNIQUE]
    mixte = list(self._mixte_dims(primary, all_parts))
    # the variable fields are the fields not present in another list
    classified = primary + secondary + unique + mixte
    variable = [fld for fld in self.fields if fld not in classified]
    return Util.view({'primary': primary, 'secondary': secondary,
                      'mixte': mixte, 'unique': unique,
                      'variable': variable}, mode)
return a partition dict with the list of primary, secondary, unique and variable fields.
Parameters
- mode : str (default 'id') - AnaDfield representation ('field', 'id', 'index')
- partition : list of str, int, AnaDfield or AnaField(default None) - if None, partition is the first
- distributed : boolean (default True) - Include only distributed fields
def relation_partition(self, partition=None, primary=False, noroot=False):
    '''return a dict (key: idfield, value: list of idfield) with the list of
    relationships for the fields in a partition.

    *Parameters*

    - **partition** : list (default None) - if None, partition is the first
    - **primary** : boolean (default False) - if True, relations are primary fields
    - **noroot** : boolean (default False) - if True and single primary,
    'root' field is replaced by the primary field (by the first secondary
    field if there is no primary field)'''
    partitions = self.partitions(mode='field')
    if not partitions:
        partition = None
    elif partition:
        partition = Util.view(partition, mode='field', ana=self)
    else:
        partition = partitions[0]
    part = self.field_partition(
        mode='field', partition=partition, distributed=True)
    fields_cat = {fld: cat for cat, l_fld in part.items() for fld in l_fld}
    relations = {}
    for field, cat in fields_cat.items():
        if cat == 'primary':
            rel = [field.idfield]
        elif cat == 'variable':
            rel = [fld.idfield for fld in part['primary']]
        elif cat == 'secondary':
            if primary:
                rel = [fld.idfield for fld in field.ascendants()
                       if fld in part['primary']]
            else:
                rel = [field.p_derived.idfield]
        elif cat == 'mixte':
            dims = self._mixte_dims(partition, partitions)
            rel = [fld.idfield for fld in dims[field]]
        else:
            # 'unique' fields (and any other category) have no relation
            rel = []
        if noroot and rel == ['root']:
            nb_primary = len(part['primary'])
            if nb_primary == 1:
                rel = [part['primary'][0].idfield]
            elif nb_primary == 0:
                rel = [part['secondary'][0].idfield]
        relations[field.idfield] = rel
    return relations
return a dict with the list of relationships for fields in a partition.
Parameters
- partition : list (default None) - if None, partition is the first
- primary : boolean (default False) - if True, relations are primary fields
- noroot : boolean (default False) - if True and single primary, 'root' field is replaced by the primary field
def indicator(self, fullsize, size):
    '''generate size indicators: ol (object lightness), ul (unicity level),
    gain (sizegain)

    If the number of values equals the number of codec values, the
    'mean coding size' and the 'object lightness' are not defined and
    their value is None.

    *Parameters*

    - **fullsize** : int - size with full codec
    - **size** : int - size with existing codec

    *Returns* : dict

    *Raises* : ZeroDivisionError if the number of values or fullsize is 0'''
    lenindex = len(self.fields)
    indexlen = sum(fld.lencodec for fld in self.fields)
    nval = len(self) * (lenindex + 1)
    sval = fullsize / nval
    ncod = indexlen + lenindex

    if nval != ncod:
        scod = (size - ncod * sval) / (nval - ncod)
        olight = scod / sval
    else:
        # no value can be gained: both indicators are undefined
        scod = olight = None

    def rounded(value):
        '''round to 3 digits, None is kept'''
        return None if value is None else round(value, 3)

    return {'total values': nval, 'mean size': rounded(sval),
            'unique values': ncod, 'mean coding size': rounded(scod),
            'unicity level': rounded(ncod / nval),
            'optimize level': rounded(size / fullsize),
            'object lightness': rounded(olight),
            'maxgain': rounded((nval - ncod) / nval),
            'gain': rounded((fullsize - size) / fullsize)}
generate size indicators: ol (object lightness), ul (unicity level), gain (sizegain)
Parameters
- fullsize : int - size with full codec
- size : int - size with existing codec
Returns : dict
class Util:
    ''' common functions for analysis package'''

    @staticmethod
    def view(field_struc, mode, ana=None):
        ''' return a representation of a AnaDfields structure (field, id, index).

        Dict and list are converted recursively. A None mode or an empty
        structure is returned unchanged.

        *Parameters*

        - **mode** : str - AnaDfield representation ('field', 'id', 'index')
        - **field_struc** : list or dict - structure to represent
        - **ana** : AnaDataset (default None) - to convert string or index in AnaDfield
        '''
        if mode is None or not field_struc:
            return field_struc
        if isinstance(field_struc, dict):
            return {key: Util.view(val, mode=mode, ana=ana)
                    for key, val in field_struc.items()}
        if isinstance(field_struc, list):
            return [Util.view(val, mode=mode, ana=ana) for val in field_struc]
        if not isinstance(field_struc, AnaDfield) and mode != 'id':
            # string or index: converted to AnaDfield by the dataset
            return Util.view(ana.dfield(field_struc), mode=mode)
        if mode == 'field':
            return field_struc
        if mode == 'index':
            return field_struc.index
        return field_struc.idfield

    @staticmethod
    def reduce_dic(obj, notempty=False):
        '''return a dict without None values (nested dict and list included).

        *Parameters*

        - **obj** : dict, list or other value - structure to reduce
        - **notempty** : boolean (default False) - if True, the values
        evaluated as False (empty list, empty dict, 0...) are also removed
        from the dicts, at every level'''
        if isinstance(obj, dict):
            # notempty is forwarded to apply the same rule to nested dicts
            return {key: Util.reduce_dic(val, notempty)
                    for key, val in obj.items()
                    if val is not None and (not notempty or val)}
        if isinstance(obj, list):
            return [Util.reduce_dic(val, notempty) for val in obj]
        return obj

    @staticmethod
    def clean_dic(obj, old, new):
        '''return a dict or list with updated strings by replacing "old" substring
        with "new" substring (keys and values of the dicts are updated)'''
        if isinstance(obj, dict):
            return {Util.clean_dic(key, old, new): Util.clean_dic(val, old, new)
                    for key, val in obj.items()}
        if isinstance(obj, str):
            return obj.replace(old, new)
        if isinstance(obj, list):
            return [Util.clean_dic(val, old, new) for val in obj]
        return obj

    @staticmethod
    def filter_dic(obj, keys):
        '''return extract of a list of dict or of a dict

        *Parameters*

        - **keys** : string, list or tuple - list of keys or single key to return
        if 'all' or None, all keys are returned
        if list, only keys in list are returned
        if string, only values associated to the string(key) are returned'''
        if not keys or keys == 'all':
            return obj
        if isinstance(obj, list):
            return [Util.filter_dic(dic, keys) for dic in obj]
        if isinstance(keys, str) and isinstance(obj, dict):
            return obj.get(keys, None)
        if isinstance(keys, (list, tuple)) and isinstance(obj, dict):
            return {key: val for key, val in obj.items() if key in keys}
        return obj
common functions for analysis package
1148 @staticmethod 1149 def view(field_struc, mode, ana=None): 1150 ''' return a representation of a AnaDfields structure (field, id, index). 1151 1152 *Parameters* 1153 1154 - **mode** : str - AnaDfield representation ('field', 'id', 'index') 1155 - **field_struc** : list or dict - structure to represent 1156 - **ana** : AnaDataset (default None) - to convert string or index in AnaDfield 1157 ''' 1158 1159 if mode is None or not field_struc: 1160 return field_struc 1161 if isinstance(field_struc, dict): 1162 return {key: Util.view(val, mode=mode, ana=ana) 1163 for key, val in field_struc.items()} 1164 if isinstance(field_struc, list): 1165 return [Util.view(val, mode=mode, ana=ana) for val in field_struc] 1166 if not isinstance(field_struc, AnaDfield) and mode != 'id': 1167 return Util.view(ana.dfield(field_struc), mode=mode) 1168 return field_struc if mode == 'field' else ( 1169 field_struc.index if mode == 'index' else field_struc.idfield)
return a representation of a AnaDfields structure (field, id, index).
Parameters
- mode : str - AnaDfield representation ('field', 'id', 'index')
- field_struc : list or dict - structure to represent
- ana : AnaDataset (default None) - to convert string or index in AnaDfield
1171 @staticmethod 1172 def reduce_dic(obj, notempty=False): 1173 '''return a dict without None values''' 1174 if isinstance(obj, dict): 1175 return {key: Util.reduce_dic(val) for key, val in obj.items() 1176 if not val is None and (not notempty or val)} 1177 if isinstance(obj, list): 1178 return [Util.reduce_dic(val) for val in obj] 1179 return obj
return a dict without None values
1181 @staticmethod 1182 def clean_dic(obj, old, new): 1183 '''return a dict or list with updated strings by replacing "old" substring 1184 with "new" substring''' 1185 if isinstance(obj, dict): 1186 return {Util.clean_dic(key, old, new): Util.clean_dic(val, old, new) 1187 for key, val in obj.items()} 1188 if isinstance(obj, str): 1189 return obj.replace(old, new) 1190 if isinstance(obj, list): 1191 return [Util.clean_dic(val, old, new) for val in obj] 1192 return obj
return a dict or list with updated strings by replacing "old" substring with "new" substring
1194 @staticmethod 1195 def filter_dic(obj, keys): 1196 '''return extract of a list of dict or of a dict 1197 1198 *Parameters* 1199 1200 - **keys** : string, list or tuple - list of keys or single key to return 1201 if 'all' or None, all keys are returned 1202 if list, only keys in list are returned 1203 if string, only values associated to the string(key) are returned''' 1204 if not keys or keys == 'all': 1205 return obj 1206 if isinstance(obj, list): 1207 return [Util.filter_dic(dic, keys) for dic in obj] 1208 if isinstance(keys, str) and isinstance(obj, dict): 1209 return obj.get(keys, None) 1210 if isinstance(keys, (list, tuple)) and isinstance(obj, dict): 1211 return {key: val for key, val in obj.items() if key in keys} 1212 return obj
return extract of a list of dict or of a dict
Parameters
- keys : string, list or tuple - list of keys or single key to return if 'all' or None, all keys are returned if list, only keys in list are returned if string, only values associated to the string(key) are returned
Analysis Exception
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- add_note
- args