python.observation.dataset_analysis
Created on Sun Oct 2 22:24:59 2022
@author: philippe@loco-labs.io
The `python.observation.dataset_analysis` module contains the `Analysis` class.
1# -*- coding: utf-8 -*- 2""" 3Created on Sun Oct 2 22:24:59 2022 4 5@author: philippe@loco-labs.io 6 7The `python.observation.dataset_analysis` module contains the `Analysis` class. 8 9""" 10 11# %% declarations 12from copy import copy 13import pprint 14from collections import Counter 15 16from observation.util import util 17 18 19class Analysis: 20 '''This class analyses relationships included in a tabular object 21 (Pandas DataFrame, Dataset, Observation, list of list). 22 23 The Analysis class includes the following functions: 24 - identification and qualification of the relationships between Field, 25 - generation of the global properties of the structure 26 - data actualization based on structure updates 27 28 *Attributes* : 29 30 - **iobj** : Dataset or Observation associated to the Analysis object 31 - **hashi** : internal Id of the iobj 32 - **matrix** : square matrix with relationship properties between two fields 33 - **infos** : list of characteristics (matrix synthesis) 34 - **primary** : list of 'primary' fields row 35 - **secondary** : list of 'secondary' fields row 36 - **lvarname** : list of 'variable' fields name 37 38 The methods defined in this class are : 39 40 - `Analysis.actualize` 41 - `Analysis.actualize` 42 - `Analysis.check_relationship` 43 - `Analysis.getmatrix` 44 - `Analysis.getvarname` 45 - `Analysis.getsecondary` 46 - `Analysis.getprimary` 47 - `Analysis.getpartition` 48 - `Analysis.tree` 49 ''' 50 # %% methods 51 52 def __init__(self, iobj): 53 '''Analysis constructor. 54 55 *Parameters* 56 57 - **iobj** : object - tabular object (Pandas DataFrame, Dataset, Observation, 58 list of list) 59 60 Note: The Analysis data can be update only if tabular object is Dataset or 61 Observation. 
62 ''' 63 if iobj.__class__.__name__ in ('Dataset', 'Observation', 'Ndataset', 'Sdataset'): 64 self.iobj = iobj 65 elif iobj.__class__.__name__ == 'DataFrame': 66 from observation import Sdataset 67 self.iobj = Sdataset(iobj) 68 else: 69 from dataset import Dataset 70 self.iobj = Dataset.obj(iobj) 71 self.hashi = None 72 self.matrix = None 73 self.infos = None 74 self.primary = None 75 self.secondary = None 76 self.lvarname = None 77 self.partition = [] 78 self.groups = [] 79 80 def actualize(self, partition=None): 81 ''' update all data with new values of iobj 82 83 *Parameters* 84 85 - **partition** : list of int (default None) - partition to be used ''' 86 self.matrix = self._setmatrix() 87 self._setinfos() 88 self._setparent() 89 self._setgroups() 90 self._setpartition() 91 self._setinfospartition(partition) 92 self.hashi = self.iobj._hashi() 93 self.lvarname = [idx['name'] 94 for idx in self.infos if idx['cat'] == 'variable'] 95 coupledvar = [idx['name'] for idx in self.infos if idx['cat'] == 'coupled' 96 and self.infos[idx['parent']]['cat'] == 'variable'] 97 self.lvarname += coupledvar 98 self.secondary = [idx['num'] 99 for idx in self.infos if idx['cat'] == 'secondary'] 100 coupledsec = [idx['num'] for idx in self.infos if idx['cat'] == 'coupled' 101 and self.infos[idx['parent']]['cat'] in ('primary', 'secondary')] 102 self.secondary += coupledsec 103 #infosidx = [idx for idx in self.infos if idx['cat'] != 'variable'] 104 infosidx = [idx for idx in self.infos if idx['cat'] != 'variable' and 105 not (idx['cat'] == 'coupled' and 106 self.infos[idx['parent']]['cat'] == 'variable') ] 107 self.primary = [infosidx.index(idx) 108 for idx in infosidx if idx['cat'] == 'primary'] 109 110 def check_relationship(self, relations): 111 '''get the list of inconsistent records for each relationship defined in relations 112 113 *Parameters* 114 115 - **relations** : list of dict - list of fields with relationship property 116 117 *Returns* : dict with for each relationship: 
key = pair of name, 118 and value = list of inconsistent records''' 119 if not isinstance(relations, (list, dict)): 120 raise AnalysisError("relations is not correct") 121 if isinstance(relations, dict): 122 relations = [relations] 123 dic_res = {} 124 for field in relations: 125 if not 'relationship' in field or not 'name' in field: 126 continue 127 if not 'parent' in field['relationship'] or not 'link' in field['relationship']: 128 raise AnalysisError("relationship is not correct") 129 rel = field['relationship']['link'] 130 f_parent = self.iobj.nindex(field['relationship']['parent']) 131 f_field = self.iobj.nindex(field['name']) 132 name_rel = field['name'] + ' - ' + field['relationship']['parent'] 133 if f_parent is None or f_field is None: 134 raise AnalysisError("field's name are not present in data") 135 match rel: 136 case 'derived': 137 dic_res[name_rel] = f_parent.coupling(f_field, reindex=True) 138 case 'coupled': 139 dic_res[name_rel] = f_parent.coupling(f_field, derived=False, reindex=True) 140 case _: 141 raise AnalysisError(rel + "is not a valid relationship") 142 return dic_res 143 144 def getinfos(self, keys=None): 145 '''return attribute infos 146 147 *Parameters* 148 149 - **keys** : string, list or tuple (default None) - list of attributes to returned 150 if 'all' or None, all attributes are returned 151 if 'struct', only structural attributes are returned''' 152 if self.hashi != self.iobj._hashi(): 153 self.actualize() 154 if keys == 'struct': 155 keys = ['num', 'name', 'cat', 'child', 'crossed', 'distparent', 156 'diffdistparent', 'parent', 'pparent', 'rateder', 'ratecpl'] 157 if not keys or keys == 'all': 158 return self.infos 159 return [{k: v for k, v in inf.items() if k in keys} for inf in self.infos] 160 161 def getmatrix(self, name=None): 162 '''return attribute matrix or only one value of the matrix defined by two names 163 164 *Parameters* 165 166 - **name** : list or tuple (default None) - list of two fields names 167 ''' 168 if 
self.hashi != self.iobj._hashi(): 169 self.actualize() 170 if not name or not isinstance(name, list): 171 return self.matrix 172 if name[0] in self.iobj.lname: 173 ind0 = self.iobj.lname.index(name[0]) 174 if len(name) == 1: 175 return self.matrix[ind0] 176 if len(name) > 1 and name[1] in self.iobj.lname: 177 return self.matrix[ind0][self.iobj.lname.index(name[1])] 178 return None 179 180 def getvarname(self): 181 '''return variable Field name''' 182 if self.hashi != self.iobj._hashi(): 183 self.actualize() 184 return self.lvarname 185 186 def getprimary(self): 187 '''return attribute primary''' 188 if self.hashi != self.iobj._hashi(): 189 self.actualize() 190 return self.primary 191 192 def getsecondary(self): 193 '''return attribute secondary''' 194 if self.hashi != self.iobj._hashi(): 195 self.actualize() 196 return self.secondary 197 198 def getpartition(self): 199 '''return attribute partition''' 200 if self.hashi != self.iobj._hashi(): 201 self.actualize() 202 return self.partition 203 204 def getgroups(self): 205 '''return attribute groups''' 206 if self.hashi != self.iobj._hashi(): 207 self.actualize() 208 return self.groups 209 210 def tree(self, mode='derived', width=5, lname=20, string=True): 211 '''return a string with a tree of derived Field. 
212 213 *Parameters* 214 215 - **lname** : integer (default 20) - length of the names 216 - **width** : integer (default 5) - length of the lines 217 - **mode** : string (default 'derived') - kind of tree : 218 'derived' : derived tree 219 'distance': min distance tree 220 'diff': min dist rate tree 221 ''' 222 if mode == 'derived': 223 modeparent = 'parent' 224 elif mode == 'distance': 225 modeparent = 'minparent' 226 elif mode == 'diff': 227 modeparent = 'distparent' 228 else: 229 raise AnalysisError('mode is unknown') 230 if self.hashi != self.iobj._hashi(): 231 self.actualize() 232 child = [None] * (len(self.infos) + 1) 233 for i in range(len(self.infos)): 234 parent = self.infos[i][modeparent] 235 if child[parent + 1] is None: 236 child[parent + 1] = [] 237 child[parent + 1].append(i) 238 tr = self._dic_noeud(-1, child, lname, mode) 239 if string: 240 tre = pprint.pformat(tr, indent=0, width=width) 241 tre = tre.replace('---', ' - ') 242 tre = tre.replace(' ', ' ') 243 tre = tre.replace('*', ' ') 244 for c in ["'", "\"", "{", "[", "]", "}", ","]: 245 tre = tre.replace(c, "") 246 return tre 247 return tr 248 249 # %% internal methods 250 def _setmatrix(self): 251 '''set and return matrix attributes (coupling infos between each idx)''' 252 lenindex = self.iobj.lenindex 253 mat = [[None for i in range(lenindex)] for i in range(lenindex)] 254 for i in range(lenindex): 255 for j in range(i, lenindex): 256 mat[i][j] = self.iobj.lindex[i].couplinginfos( 257 self.iobj.lindex[j]) 258 for j in range(i): 259 mat[i][j] = copy(mat[j][i]) 260 if mat[i][j]['typecoupl'] == 'derived': 261 mat[i][j]['typecoupl'] = 'derive' 262 elif mat[i][j]['typecoupl'] == 'derive': 263 mat[i][j]['typecoupl'] = 'derived' 264 elif mat[i][j]['typecoupl'] == 'linked': 265 mat[i][j]['typecoupl'] = 'link' 266 elif mat[i][j]['typecoupl'] == 'link': 267 mat[i][j]['typecoupl'] = 'linked' 268 return mat 269 270 def _setinfos(self): 271 '''set and return attribute 'infos'. 
272 Infos is an array with infos of each index : 273 - num, name, cat, child, crossed, distparent, diffdistparent, 274 parent, pparent, rateder''' 275 lenindex = self.iobj.lenindex 276 leniobj = len(self.iobj) 277 self.infos = [{} for i in range(lenindex)] 278 for i in range(lenindex): 279 self.infos[i]['num'] = i 280 self.infos[i]['name'] = self.iobj.lname[i] 281 self.infos[i]['cat'] = 'null' 282 self.infos[i]['parent'] = -1 283 self.infos[i]['distparent'] = -1 284 self.infos[i]['minparent'] = -1 285 self.infos[i]['pparent'] = -2 286 self.infos[i]['diffdistparent'] = -1 287 self.infos[i]['distance'] = leniobj * leniobj 288 self.infos[i]['ratecpl'] = 1 289 self.infos[i]['rateder'] = 1 290 self.infos[i]['child'] = [] 291 self.infos[i]['crossed'] = [] 292 self.infos[i] |= self.iobj.lindex[i].infos 293 if self.infos[i]['typecodec'] == 'unique': 294 self.infos[i]['pparent'] = -1 295 self.infos[i]['cat'] = 'unique' 296 self.infos[i]['diffdistparent'] = leniobj - 1 297 self.infos[i]['rateder'] = 0 298 for i in range(lenindex): 299 for j in range(i+1, lenindex): 300 if self.matrix[i][j]['typecoupl'] == 'coupled' and \ 301 self.infos[j]['parent'] == -1: 302 self.infos[j]['parent'] = i 303 self.infos[j]['distparent'] = i 304 self.infos[j]['diffdistparent'] = 0 305 self.infos[j]['rateder'] = 0 306 self.infos[j]['cat'] = 'coupled' 307 self.infos[i]['child'].append(j) 308 return 309 310 def _setinfospartition(self, partition=None): 311 '''add partition data into infos attribute''' 312 if not partition is None and not partition in self.partition: 313 raise AnalysisError('partition is not a valid partition') 314 lenindex = self.iobj.lenindex 315 infosp = self.infos 316 if not partition and len(self.partition) > 0: 317 partition = self.partition[0] 318 if partition: 319 for i in partition: 320 infosp[i]['cat'] = 'primary' 321 infosp[i]['pparent'] = i 322 for i in range(lenindex): 323 if infosp[i]['cat'] == 'null': 324 util.pparent2(i, infosp) 325 if infosp[i]['pparent'] == -1 and 
partition: 326 infosp[i]['cat'] = 'variable' 327 else: 328 infosp[i]['cat'] = 'secondary' 329 for i in range(lenindex): 330 if infosp[i]['cat'] == 'coupled': 331 infosp[i]['pparent'] = infosp[infosp[i]['parent']]['pparent'] 332 333 def _setparent(self): 334 '''set parent (Field with minimal diff) for each Field''' 335 # parent : min(diff) -> child 336 # distparent : min(rateder) -> diffdistparent, rateder(rateA) 337 # minparent : min(distance) -> rate(rateB), distance 338 lenindex = self.iobj.lenindex 339 leniobj = len(self.iobj) 340 for i in range(lenindex): 341 mindiff = leniobj 342 ratedermin = 1 343 distancemin = leniobj * leniobj 344 distparent = None 345 minparent = None 346 parent = None 347 infoi = self.infos[i] 348 for j in range(lenindex): 349 matij = self.matrix[i][j] 350 if not infoi['cat'] in ['unique', 'coupled']: 351 if i != j and not i in self._listparent(j, 'parent') and \ 352 matij['typecoupl'] in ('coupled', 'derived') and \ 353 matij['diff'] < mindiff: 354 mindiff = matij['diff'] 355 parent = j 356 elif i != j and matij['typecoupl'] == 'crossed' and \ 357 self.infos[j]['cat'] != 'coupled': 358 infoi['crossed'].append(j) 359 if i != j and not i in self._listparent(j, 'distparent') and \ 360 matij['typecoupl'] in ('coupled', 'derived', 'linked', 'crossed') and \ 361 matij['rateder'] < ratedermin: 362 ratedermin = matij['rateder'] 363 distparent = j 364 if i != j and not i in self._listparent(j, 'minparent') and \ 365 matij['distance'] < distancemin and \ 366 infoi['lencodec'] <= self.infos[j]['lencodec'] and \ 367 self.infos[j]['cat'] != 'coupled': 368 distancemin = matij['distance'] 369 minparent = j 370 if not infoi['cat'] in ['unique', 'coupled']: 371 if not parent is None: 372 infoi['parent'] = parent 373 self.infos[parent]['child'].append(i) 374 if not distparent is None: 375 infoi['distparent'] = distparent 376 infoi['diffdistparent'] = self.matrix[i][distparent]['diff'] 377 infoi['rateder'] = self.matrix[i][distparent]['rateder'] 378 if not 
minparent is None: 379 infoi['minparent'] = minparent 380 infoi['distance'] = self.matrix[i][minparent]['distance'] 381 infoi['ratecpl'] = self.matrix[i][minparent]['ratecpl'] 382 else: 383 infoi['distance'] = leniobj - infoi['lencodec'] 384 return 385 386 def _listparent(self, idx, typeparent): 387 parent = idx 388 listparent = [] 389 while not parent is None and parent >= 0: 390 parent = self.infos[parent][typeparent] 391 if not parent is None and parent >= 0: 392 listparent.append(parent) 393 return listparent 394 395 def _dic_noeud(self, n, child, lname, mode): 396 '''generate a dict with nodes data defined by 'child' ''' 397 if n == -1: 398 lis = ['root-' + mode + '*(' + str(len(self.iobj)) + ')'] 399 else: 400 adding = '' 401 if mode == 'distance': 402 adding = str(self.infos[n]['distance']) + ' - ' 403 elif mode == 'diff': 404 adding = str(format(self.infos[n]['rateder'], '.2e')) + ' - ' 405 adding += str(self.infos[n]['lencodec']) 406 name = self.infos[n]['name'] + ' (' + adding + ')' 407 lis = [name.replace(' ', '*').replace("'", '*')] 408 if child[n+1]: 409 for ch in child[n+1]: 410 if ch != n: 411 lis.append(self._dic_noeud(ch, child, lname, mode)) 412 return {str(n).ljust(2, '*'): lis} 413 414 def _setgroups(self): 415 '''set groups (list of crossed Field groups)''' 416 self.groups = [] 417 crossed = {info['num'] for info in self.infos if info['crossed']} 418 remove = set() 419 for num in crossed: 420 for num2 in crossed: 421 if num != num2 and self.infos[num]['parent'] in crossed: 422 remove.add(num) 423 crossed -= remove 424 setcrossed = set() 425 for num in crossed: 426 info = self.infos[num] 427 if not info['name'] in setcrossed: 428 setname = {self.infos[cros]['name'] for cros in info['crossed'] 429 if cros in crossed} | {info['name']} 430 self.groups.append(setname) 431 setcrossed |= setname 432 return None 433 434 def _setpartition(self): 435 '''set partition (list of Field partitions)''' 436 brother = {idx['num']: idx['crossed'] 437 for idx in 
self.infos if idx['crossed']} 438 self.partition = [] 439 chemin = [] 440 for cros in brother: 441 chemin = [] 442 self._addchemin(chemin, cros, 1, brother) 443 childroot = [idx['num'] for idx in self.infos if idx['parent'] == -1 444 and idx['typecodec'] in ('complete', 'full')] 445 if childroot: 446 self.partition.append(childroot) 447 return None 448 449 def _addchemin(self, chemin, node, lchemin, brother): 450 '''extend 'chemin' with new nodes and add it to 'partition' ''' 451 if lchemin == len(self.iobj) and node == chemin[0] and \ 452 max(Counter(zip(*[self.iobj.lindex[idx].keys for idx in chemin])).values()) == 1: 453 part = sorted(chemin) 454 if not part in self.partition: 455 if not self.partition or len(part) > len(self.partition[0]): 456 self.partition.insert(0, part) 457 else: 458 self.partition.append(part) 459 if node in chemin[1:]: 460 return 461 lnode = self.infos[node]['lencodec'] 462 if lchemin * lnode <= len(self.iobj): 463 newchemin = chemin + [node] 464 for broth in brother[node]: 465 self._addchemin(newchemin, broth, lchemin * lnode, brother) 466 467 468class AnalysisError(Exception): 469 ''' Analysis Exception''' 470 # pass
20class Analysis: 21 '''This class analyses relationships included in a tabular object 22 (Pandas DataFrame, Dataset, Observation, list of list). 23 24 The Analysis class includes the following functions: 25 - identification and qualification of the relationships between Field, 26 - generation of the global properties of the structure 27 - data actualization based on structure updates 28 29 *Attributes* : 30 31 - **iobj** : Dataset or Observation associated to the Analysis object 32 - **hashi** : internal Id of the iobj 33 - **matrix** : square matrix with relationship properties between two fields 34 - **infos** : list of characteristics (matrix synthesis) 35 - **primary** : list of 'primary' fields row 36 - **secondary** : list of 'secondary' fields row 37 - **lvarname** : list of 'variable' fields name 38 39 The methods defined in this class are : 40 41 - `Analysis.actualize` 42 - `Analysis.actualize` 43 - `Analysis.check_relationship` 44 - `Analysis.getmatrix` 45 - `Analysis.getvarname` 46 - `Analysis.getsecondary` 47 - `Analysis.getprimary` 48 - `Analysis.getpartition` 49 - `Analysis.tree` 50 ''' 51 # %% methods 52 53 def __init__(self, iobj): 54 '''Analysis constructor. 55 56 *Parameters* 57 58 - **iobj** : object - tabular object (Pandas DataFrame, Dataset, Observation, 59 list of list) 60 61 Note: The Analysis data can be update only if tabular object is Dataset or 62 Observation. 
63 ''' 64 if iobj.__class__.__name__ in ('Dataset', 'Observation', 'Ndataset', 'Sdataset'): 65 self.iobj = iobj 66 elif iobj.__class__.__name__ == 'DataFrame': 67 from observation import Sdataset 68 self.iobj = Sdataset(iobj) 69 else: 70 from dataset import Dataset 71 self.iobj = Dataset.obj(iobj) 72 self.hashi = None 73 self.matrix = None 74 self.infos = None 75 self.primary = None 76 self.secondary = None 77 self.lvarname = None 78 self.partition = [] 79 self.groups = [] 80 81 def actualize(self, partition=None): 82 ''' update all data with new values of iobj 83 84 *Parameters* 85 86 - **partition** : list of int (default None) - partition to be used ''' 87 self.matrix = self._setmatrix() 88 self._setinfos() 89 self._setparent() 90 self._setgroups() 91 self._setpartition() 92 self._setinfospartition(partition) 93 self.hashi = self.iobj._hashi() 94 self.lvarname = [idx['name'] 95 for idx in self.infos if idx['cat'] == 'variable'] 96 coupledvar = [idx['name'] for idx in self.infos if idx['cat'] == 'coupled' 97 and self.infos[idx['parent']]['cat'] == 'variable'] 98 self.lvarname += coupledvar 99 self.secondary = [idx['num'] 100 for idx in self.infos if idx['cat'] == 'secondary'] 101 coupledsec = [idx['num'] for idx in self.infos if idx['cat'] == 'coupled' 102 and self.infos[idx['parent']]['cat'] in ('primary', 'secondary')] 103 self.secondary += coupledsec 104 #infosidx = [idx for idx in self.infos if idx['cat'] != 'variable'] 105 infosidx = [idx for idx in self.infos if idx['cat'] != 'variable' and 106 not (idx['cat'] == 'coupled' and 107 self.infos[idx['parent']]['cat'] == 'variable') ] 108 self.primary = [infosidx.index(idx) 109 for idx in infosidx if idx['cat'] == 'primary'] 110 111 def check_relationship(self, relations): 112 '''get the list of inconsistent records for each relationship defined in relations 113 114 *Parameters* 115 116 - **relations** : list of dict - list of fields with relationship property 117 118 *Returns* : dict with for each relationship: 
key = pair of name, 119 and value = list of inconsistent records''' 120 if not isinstance(relations, (list, dict)): 121 raise AnalysisError("relations is not correct") 122 if isinstance(relations, dict): 123 relations = [relations] 124 dic_res = {} 125 for field in relations: 126 if not 'relationship' in field or not 'name' in field: 127 continue 128 if not 'parent' in field['relationship'] or not 'link' in field['relationship']: 129 raise AnalysisError("relationship is not correct") 130 rel = field['relationship']['link'] 131 f_parent = self.iobj.nindex(field['relationship']['parent']) 132 f_field = self.iobj.nindex(field['name']) 133 name_rel = field['name'] + ' - ' + field['relationship']['parent'] 134 if f_parent is None or f_field is None: 135 raise AnalysisError("field's name are not present in data") 136 match rel: 137 case 'derived': 138 dic_res[name_rel] = f_parent.coupling(f_field, reindex=True) 139 case 'coupled': 140 dic_res[name_rel] = f_parent.coupling(f_field, derived=False, reindex=True) 141 case _: 142 raise AnalysisError(rel + "is not a valid relationship") 143 return dic_res 144 145 def getinfos(self, keys=None): 146 '''return attribute infos 147 148 *Parameters* 149 150 - **keys** : string, list or tuple (default None) - list of attributes to returned 151 if 'all' or None, all attributes are returned 152 if 'struct', only structural attributes are returned''' 153 if self.hashi != self.iobj._hashi(): 154 self.actualize() 155 if keys == 'struct': 156 keys = ['num', 'name', 'cat', 'child', 'crossed', 'distparent', 157 'diffdistparent', 'parent', 'pparent', 'rateder', 'ratecpl'] 158 if not keys or keys == 'all': 159 return self.infos 160 return [{k: v for k, v in inf.items() if k in keys} for inf in self.infos] 161 162 def getmatrix(self, name=None): 163 '''return attribute matrix or only one value of the matrix defined by two names 164 165 *Parameters* 166 167 - **name** : list or tuple (default None) - list of two fields names 168 ''' 169 if 
self.hashi != self.iobj._hashi(): 170 self.actualize() 171 if not name or not isinstance(name, list): 172 return self.matrix 173 if name[0] in self.iobj.lname: 174 ind0 = self.iobj.lname.index(name[0]) 175 if len(name) == 1: 176 return self.matrix[ind0] 177 if len(name) > 1 and name[1] in self.iobj.lname: 178 return self.matrix[ind0][self.iobj.lname.index(name[1])] 179 return None 180 181 def getvarname(self): 182 '''return variable Field name''' 183 if self.hashi != self.iobj._hashi(): 184 self.actualize() 185 return self.lvarname 186 187 def getprimary(self): 188 '''return attribute primary''' 189 if self.hashi != self.iobj._hashi(): 190 self.actualize() 191 return self.primary 192 193 def getsecondary(self): 194 '''return attribute secondary''' 195 if self.hashi != self.iobj._hashi(): 196 self.actualize() 197 return self.secondary 198 199 def getpartition(self): 200 '''return attribute partition''' 201 if self.hashi != self.iobj._hashi(): 202 self.actualize() 203 return self.partition 204 205 def getgroups(self): 206 '''return attribute groups''' 207 if self.hashi != self.iobj._hashi(): 208 self.actualize() 209 return self.groups 210 211 def tree(self, mode='derived', width=5, lname=20, string=True): 212 '''return a string with a tree of derived Field. 
213 214 *Parameters* 215 216 - **lname** : integer (default 20) - length of the names 217 - **width** : integer (default 5) - length of the lines 218 - **mode** : string (default 'derived') - kind of tree : 219 'derived' : derived tree 220 'distance': min distance tree 221 'diff': min dist rate tree 222 ''' 223 if mode == 'derived': 224 modeparent = 'parent' 225 elif mode == 'distance': 226 modeparent = 'minparent' 227 elif mode == 'diff': 228 modeparent = 'distparent' 229 else: 230 raise AnalysisError('mode is unknown') 231 if self.hashi != self.iobj._hashi(): 232 self.actualize() 233 child = [None] * (len(self.infos) + 1) 234 for i in range(len(self.infos)): 235 parent = self.infos[i][modeparent] 236 if child[parent + 1] is None: 237 child[parent + 1] = [] 238 child[parent + 1].append(i) 239 tr = self._dic_noeud(-1, child, lname, mode) 240 if string: 241 tre = pprint.pformat(tr, indent=0, width=width) 242 tre = tre.replace('---', ' - ') 243 tre = tre.replace(' ', ' ') 244 tre = tre.replace('*', ' ') 245 for c in ["'", "\"", "{", "[", "]", "}", ","]: 246 tre = tre.replace(c, "") 247 return tre 248 return tr 249 250 # %% internal methods 251 def _setmatrix(self): 252 '''set and return matrix attributes (coupling infos between each idx)''' 253 lenindex = self.iobj.lenindex 254 mat = [[None for i in range(lenindex)] for i in range(lenindex)] 255 for i in range(lenindex): 256 for j in range(i, lenindex): 257 mat[i][j] = self.iobj.lindex[i].couplinginfos( 258 self.iobj.lindex[j]) 259 for j in range(i): 260 mat[i][j] = copy(mat[j][i]) 261 if mat[i][j]['typecoupl'] == 'derived': 262 mat[i][j]['typecoupl'] = 'derive' 263 elif mat[i][j]['typecoupl'] == 'derive': 264 mat[i][j]['typecoupl'] = 'derived' 265 elif mat[i][j]['typecoupl'] == 'linked': 266 mat[i][j]['typecoupl'] = 'link' 267 elif mat[i][j]['typecoupl'] == 'link': 268 mat[i][j]['typecoupl'] = 'linked' 269 return mat 270 271 def _setinfos(self): 272 '''set and return attribute 'infos'. 
273 Infos is an array with infos of each index : 274 - num, name, cat, child, crossed, distparent, diffdistparent, 275 parent, pparent, rateder''' 276 lenindex = self.iobj.lenindex 277 leniobj = len(self.iobj) 278 self.infos = [{} for i in range(lenindex)] 279 for i in range(lenindex): 280 self.infos[i]['num'] = i 281 self.infos[i]['name'] = self.iobj.lname[i] 282 self.infos[i]['cat'] = 'null' 283 self.infos[i]['parent'] = -1 284 self.infos[i]['distparent'] = -1 285 self.infos[i]['minparent'] = -1 286 self.infos[i]['pparent'] = -2 287 self.infos[i]['diffdistparent'] = -1 288 self.infos[i]['distance'] = leniobj * leniobj 289 self.infos[i]['ratecpl'] = 1 290 self.infos[i]['rateder'] = 1 291 self.infos[i]['child'] = [] 292 self.infos[i]['crossed'] = [] 293 self.infos[i] |= self.iobj.lindex[i].infos 294 if self.infos[i]['typecodec'] == 'unique': 295 self.infos[i]['pparent'] = -1 296 self.infos[i]['cat'] = 'unique' 297 self.infos[i]['diffdistparent'] = leniobj - 1 298 self.infos[i]['rateder'] = 0 299 for i in range(lenindex): 300 for j in range(i+1, lenindex): 301 if self.matrix[i][j]['typecoupl'] == 'coupled' and \ 302 self.infos[j]['parent'] == -1: 303 self.infos[j]['parent'] = i 304 self.infos[j]['distparent'] = i 305 self.infos[j]['diffdistparent'] = 0 306 self.infos[j]['rateder'] = 0 307 self.infos[j]['cat'] = 'coupled' 308 self.infos[i]['child'].append(j) 309 return 310 311 def _setinfospartition(self, partition=None): 312 '''add partition data into infos attribute''' 313 if not partition is None and not partition in self.partition: 314 raise AnalysisError('partition is not a valid partition') 315 lenindex = self.iobj.lenindex 316 infosp = self.infos 317 if not partition and len(self.partition) > 0: 318 partition = self.partition[0] 319 if partition: 320 for i in partition: 321 infosp[i]['cat'] = 'primary' 322 infosp[i]['pparent'] = i 323 for i in range(lenindex): 324 if infosp[i]['cat'] == 'null': 325 util.pparent2(i, infosp) 326 if infosp[i]['pparent'] == -1 and 
partition: 327 infosp[i]['cat'] = 'variable' 328 else: 329 infosp[i]['cat'] = 'secondary' 330 for i in range(lenindex): 331 if infosp[i]['cat'] == 'coupled': 332 infosp[i]['pparent'] = infosp[infosp[i]['parent']]['pparent'] 333 334 def _setparent(self): 335 '''set parent (Field with minimal diff) for each Field''' 336 # parent : min(diff) -> child 337 # distparent : min(rateder) -> diffdistparent, rateder(rateA) 338 # minparent : min(distance) -> rate(rateB), distance 339 lenindex = self.iobj.lenindex 340 leniobj = len(self.iobj) 341 for i in range(lenindex): 342 mindiff = leniobj 343 ratedermin = 1 344 distancemin = leniobj * leniobj 345 distparent = None 346 minparent = None 347 parent = None 348 infoi = self.infos[i] 349 for j in range(lenindex): 350 matij = self.matrix[i][j] 351 if not infoi['cat'] in ['unique', 'coupled']: 352 if i != j and not i in self._listparent(j, 'parent') and \ 353 matij['typecoupl'] in ('coupled', 'derived') and \ 354 matij['diff'] < mindiff: 355 mindiff = matij['diff'] 356 parent = j 357 elif i != j and matij['typecoupl'] == 'crossed' and \ 358 self.infos[j]['cat'] != 'coupled': 359 infoi['crossed'].append(j) 360 if i != j and not i in self._listparent(j, 'distparent') and \ 361 matij['typecoupl'] in ('coupled', 'derived', 'linked', 'crossed') and \ 362 matij['rateder'] < ratedermin: 363 ratedermin = matij['rateder'] 364 distparent = j 365 if i != j and not i in self._listparent(j, 'minparent') and \ 366 matij['distance'] < distancemin and \ 367 infoi['lencodec'] <= self.infos[j]['lencodec'] and \ 368 self.infos[j]['cat'] != 'coupled': 369 distancemin = matij['distance'] 370 minparent = j 371 if not infoi['cat'] in ['unique', 'coupled']: 372 if not parent is None: 373 infoi['parent'] = parent 374 self.infos[parent]['child'].append(i) 375 if not distparent is None: 376 infoi['distparent'] = distparent 377 infoi['diffdistparent'] = self.matrix[i][distparent]['diff'] 378 infoi['rateder'] = self.matrix[i][distparent]['rateder'] 379 if not 
minparent is None: 380 infoi['minparent'] = minparent 381 infoi['distance'] = self.matrix[i][minparent]['distance'] 382 infoi['ratecpl'] = self.matrix[i][minparent]['ratecpl'] 383 else: 384 infoi['distance'] = leniobj - infoi['lencodec'] 385 return 386 387 def _listparent(self, idx, typeparent): 388 parent = idx 389 listparent = [] 390 while not parent is None and parent >= 0: 391 parent = self.infos[parent][typeparent] 392 if not parent is None and parent >= 0: 393 listparent.append(parent) 394 return listparent 395 396 def _dic_noeud(self, n, child, lname, mode): 397 '''generate a dict with nodes data defined by 'child' ''' 398 if n == -1: 399 lis = ['root-' + mode + '*(' + str(len(self.iobj)) + ')'] 400 else: 401 adding = '' 402 if mode == 'distance': 403 adding = str(self.infos[n]['distance']) + ' - ' 404 elif mode == 'diff': 405 adding = str(format(self.infos[n]['rateder'], '.2e')) + ' - ' 406 adding += str(self.infos[n]['lencodec']) 407 name = self.infos[n]['name'] + ' (' + adding + ')' 408 lis = [name.replace(' ', '*').replace("'", '*')] 409 if child[n+1]: 410 for ch in child[n+1]: 411 if ch != n: 412 lis.append(self._dic_noeud(ch, child, lname, mode)) 413 return {str(n).ljust(2, '*'): lis} 414 415 def _setgroups(self): 416 '''set groups (list of crossed Field groups)''' 417 self.groups = [] 418 crossed = {info['num'] for info in self.infos if info['crossed']} 419 remove = set() 420 for num in crossed: 421 for num2 in crossed: 422 if num != num2 and self.infos[num]['parent'] in crossed: 423 remove.add(num) 424 crossed -= remove 425 setcrossed = set() 426 for num in crossed: 427 info = self.infos[num] 428 if not info['name'] in setcrossed: 429 setname = {self.infos[cros]['name'] for cros in info['crossed'] 430 if cros in crossed} | {info['name']} 431 self.groups.append(setname) 432 setcrossed |= setname 433 return None 434 435 def _setpartition(self): 436 '''set partition (list of Field partitions)''' 437 brother = {idx['num']: idx['crossed'] 438 for idx in 
self.infos if idx['crossed']} 439 self.partition = [] 440 chemin = [] 441 for cros in brother: 442 chemin = [] 443 self._addchemin(chemin, cros, 1, brother) 444 childroot = [idx['num'] for idx in self.infos if idx['parent'] == -1 445 and idx['typecodec'] in ('complete', 'full')] 446 if childroot: 447 self.partition.append(childroot) 448 return None 449 450 def _addchemin(self, chemin, node, lchemin, brother): 451 '''extend 'chemin' with new nodes and add it to 'partition' ''' 452 if lchemin == len(self.iobj) and node == chemin[0] and \ 453 max(Counter(zip(*[self.iobj.lindex[idx].keys for idx in chemin])).values()) == 1: 454 part = sorted(chemin) 455 if not part in self.partition: 456 if not self.partition or len(part) > len(self.partition[0]): 457 self.partition.insert(0, part) 458 else: 459 self.partition.append(part) 460 if node in chemin[1:]: 461 return 462 lnode = self.infos[node]['lencodec'] 463 if lchemin * lnode <= len(self.iobj): 464 newchemin = chemin + [node] 465 for broth in brother[node]: 466 self._addchemin(newchemin, broth, lchemin * lnode, brother)
This class analyses relationships included in a tabular object (Pandas DataFrame, Dataset, Observation, list of list).
The Analysis class includes the following functions:
- identification and qualification of the relationships between Field,
- generation of the global properties of the structure
- data actualization based on structure updates
Attributes :
- iobj : Dataset or Observation associated to the Analysis object
- hashi : internal Id of the iobj
- matrix : square matrix with relationship properties between two fields
- infos : list of characteristics (matrix synthesis)
- primary : list of rows of the 'primary' fields
- secondary : list of rows of the 'secondary' fields
- lvarname : list of the names of the 'variable' fields
The methods defined in this class are :
def __init__(self, iobj):
    '''Analysis constructor.

    *Parameters*

    - **iobj** : object - tabular object (Pandas DataFrame, Dataset, Observation,
    list of list)

    The class name of `iobj` selects how it is stored:

    - 'DataFrame' : wrapped with `Sdataset(iobj)`,
    - 'Dataset', 'Observation', 'Ndataset', 'Sdataset' : stored unchanged,
    - anything else : converted with `Dataset.obj(iobj)`.

    All computed attributes start empty (None or []).

    Note: The Analysis data can be updated only if the tabular object is a
    Dataset or an Observation.
    '''
    clsname = iobj.__class__.__name__
    if clsname == 'DataFrame':
        # imported locally, only when a DataFrame has to be wrapped
        from observation import Sdataset
        self.iobj = Sdataset(iobj)
    elif clsname not in ('Dataset', 'Observation', 'Ndataset', 'Sdataset'):
        # imported locally, only when a conversion is needed
        from dataset import Dataset
        self.iobj = Dataset.obj(iobj)
    else:
        self.iobj = iobj
    self.hashi = self.matrix = self.infos = None
    self.primary = self.secondary = self.lvarname = None
    self.partition = []
    self.groups = []
Analysis constructor.
Parameters
- iobj : object - tabular object (Pandas DataFrame, Dataset, Observation, list of list)
Note: The Analysis data can be updated only if the tabular object is a Dataset or an Observation.
def actualize(self, partition=None):
    '''update all data with new values of iobj

    The matrix, infos, parents, groups and partitions are rebuilt through the
    private `_set...` methods, then `hashi`, `lvarname`, `secondary` and
    `primary` are derived from `infos`.

    *Parameters*

    - **partition** : list of int (default None) - partition to be used
    (forwarded to `_setinfospartition`)'''
    self.matrix = self._setmatrix()
    self._setinfos()
    self._setparent()
    self._setgroups()
    self._setpartition()
    self._setinfospartition(partition)
    self.hashi = self.iobj._hashi()
    infos = self.infos

    def parent_cat(info):
        '''category of the parent field of `info`'''
        return infos[info['parent']]['cat']

    # variable fields, followed by the fields coupled to a variable field
    self.lvarname = [info['name'] for info in infos
                     if info['cat'] == 'variable']
    self.lvarname += [info['name'] for info in infos
                      if info['cat'] == 'coupled'
                      and parent_cat(info) == 'variable']

    # secondary fields, followed by the fields coupled to a primary or
    # secondary field
    self.secondary = [info['num'] for info in infos
                      if info['cat'] == 'secondary']
    self.secondary += [info['num'] for info in infos
                       if info['cat'] == 'coupled'
                       and parent_cat(info) in ('primary', 'secondary')]

    # 'primary' holds positions counted among the fields that are neither
    # variable nor coupled to a variable field
    infosidx = [info for info in infos
                if info['cat'] != 'variable'
                and not (info['cat'] == 'coupled'
                         and parent_cat(info) == 'variable')]
    # enumerate gives the position directly (no list.index search per item)
    self.primary = [row for row, info in enumerate(infosidx)
                    if info['cat'] == 'primary']
update all data with new values of iobj
Parameters
- partition : list of int (default None) - partition to be used
def check_relationship(self, relations):
    '''get the list of inconsistent records for each relationship defined in relations

    *Parameters*

    - **relations** : list of dict (or a single dict) - list of fields with
    relationship property. A field without a 'relationship' or a 'name' key
    is ignored.

    *Returns* : dict with for each relationship: key = pair of name
    ('name - parent'), and value = list of inconsistent records

    *Raises* : AnalysisError if `relations` is neither a list nor a dict, if a
    relationship has no 'parent' or no 'link', if a field name is unknown in
    the data, or if the link is neither 'derived' nor 'coupled'.'''
    if not isinstance(relations, (list, dict)):
        raise AnalysisError("relations is not correct")
    if isinstance(relations, dict):
        relations = [relations]
    dic_res = {}
    for field in relations:
        if 'relationship' not in field or 'name' not in field:
            continue
        relationship = field['relationship']
        if 'parent' not in relationship or 'link' not in relationship:
            raise AnalysisError("relationship is not correct")
        rel = relationship['link']
        f_parent = self.iobj.nindex(relationship['parent'])
        f_field = self.iobj.nindex(field['name'])
        name_rel = field['name'] + ' - ' + relationship['parent']
        if f_parent is None or f_field is None:
            raise AnalysisError("field's name are not present in data")
        if rel == 'derived':
            dic_res[name_rel] = f_parent.coupling(f_field, reindex=True)
        elif rel == 'coupled':
            dic_res[name_rel] = f_parent.coupling(
                f_field, derived=False, reindex=True)
        else:
            # f-string: keeps a space before 'is' and accepts a non-string link
            raise AnalysisError(f"{rel} is not a valid relationship")
    return dic_res
get the list of inconsistent records for each relationship defined in relations
Parameters
- relations : list of dict - list of fields with relationship property
Returns : dict with for each relationship: key = pair of name, and value = list of inconsistent records
def getinfos(self, keys=None):
    '''return attribute infos

    The data is refreshed with `actualize` first when the hash of `iobj`
    differs from the stored one.

    *Parameters*

    - **keys** : string, list or tuple (default None) - list of attributes to return
        if 'all' or None, all attributes are returned
        if 'struct', only structural attributes are returned
        any other string is taken as the name of a single attribute

    *Returns* : `infos` itself when all attributes are requested, else a new
    list of dict restricted to the requested attributes'''
    if self.hashi != self.iobj._hashi():
        self.actualize()
    if not keys or keys == 'all':
        return self.infos
    if keys == 'struct':
        keys = ['num', 'name', 'cat', 'child', 'crossed', 'distparent',
                'diffdistparent', 'parent', 'pparent', 'rateder', 'ratecpl']
    elif isinstance(keys, str):
        # a lone name must match exactly: 'k in "distparent"' would be a
        # substring test and would also select 'parent'
        keys = [keys]
    return [{k: v for k, v in inf.items() if k in keys} for inf in self.infos]
return attribute infos
Parameters
- keys : string, list or tuple (default None) - list of attributes to return; if 'all' or None, all attributes are returned; if 'struct', only structural attributes are returned
def getmatrix(self, name=None):
    '''return attribute matrix or only one value of the matrix defined by two names

    The data is refreshed with `actualize` first when the hash of `iobj`
    differs from the stored one.

    *Parameters*

    - **name** : list or tuple (default None) - list of one or two field names

    *Returns* :

    - the whole matrix if `name` is empty or is neither a list nor a tuple,
    - the row of the first field if `name` holds a single name,
    - the value at (first field, second field) if `name` holds two names,
    - None if a name is not a field name of `iobj`.
    '''
    if self.hashi != self.iobj._hashi():
        self.actualize()
    # tuples are accepted like lists, as announced in the parameters
    if not name or not isinstance(name, (list, tuple)):
        return self.matrix
    lname = self.iobj.lname
    if name[0] not in lname:
        return None
    row = self.matrix[lname.index(name[0])]
    if len(name) == 1:
        return row
    if name[1] in lname:
        return row[lname.index(name[1])]
    return None
return attribute matrix or only one value of the matrix defined by two names
Parameters
- name : list or tuple (default None) - list of two field names
def getvarname(self):
    '''return attribute lvarname (names of the variable Fields), after a
    refresh with `actualize` if the hash of iobj has changed'''
    outdated = self.hashi != self.iobj._hashi()
    if outdated:
        self.actualize()
    return self.lvarname
return the list of variable Field names
def getprimary(self):
    '''return attribute primary, after a refresh with `actualize` if the
    hash of iobj has changed'''
    current = self.iobj._hashi()
    if self.hashi != current:
        self.actualize()
    return self.primary
return attribute primary
def getsecondary(self):
    '''return attribute secondary, after a refresh with `actualize` if the
    hash of iobj has changed'''
    if not self.hashi == self.iobj._hashi():
        self.actualize()
    result = self.secondary
    return result
return attribute secondary
def getpartition(self):
    '''return attribute partition, after a refresh with `actualize` if the
    hash of iobj has changed'''
    up_to_date = self.hashi == self.iobj._hashi()
    if not up_to_date:
        self.actualize()
    return self.partition
return attribute partition
def getgroups(self):
    '''return attribute groups, after a refresh with `actualize` if the
    hash of iobj has changed'''
    stored, actual = self.hashi, self.iobj._hashi()
    if stored != actual:
        self.actualize()
    return self.groups
return attribute groups
211 def tree(self, mode='derived', width=5, lname=20, string=True): 212 '''return a string with a tree of derived Field. 213 214 *Parameters* 215 216 - **lname** : integer (default 20) - length of the names 217 - **width** : integer (default 5) - length of the lines 218 - **mode** : string (default 'derived') - kind of tree : 219 'derived' : derived tree 220 'distance': min distance tree 221 'diff': min dist rate tree 222 ''' 223 if mode == 'derived': 224 modeparent = 'parent' 225 elif mode == 'distance': 226 modeparent = 'minparent' 227 elif mode == 'diff': 228 modeparent = 'distparent' 229 else: 230 raise AnalysisError('mode is unknown') 231 if self.hashi != self.iobj._hashi(): 232 self.actualize() 233 child = [None] * (len(self.infos) + 1) 234 for i in range(len(self.infos)): 235 parent = self.infos[i][modeparent] 236 if child[parent + 1] is None: 237 child[parent + 1] = [] 238 child[parent + 1].append(i) 239 tr = self._dic_noeud(-1, child, lname, mode) 240 if string: 241 tre = pprint.pformat(tr, indent=0, width=width) 242 tre = tre.replace('---', ' - ') 243 tre = tre.replace(' ', ' ') 244 tre = tre.replace('*', ' ') 245 for c in ["'", "\"", "{", "[", "]", "}", ","]: 246 tre = tre.replace(c, "") 247 return tre 248 return tr
return a string with a tree of derived Field.
Parameters
- lname : integer (default 20) - length of the names
- width : integer (default 5) - length of the lines
- mode : string (default 'derived') - kind of tree : 'derived' : derived tree 'distance': min distance tree 'diff': min dist rate tree
Analysis Exception
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- args