@@ -73,13 +73,13 @@ class MetaKnowledgeGraph:
7373 error_log = stderr
7474
7575 def __init__ (
76- self ,
77- name = "" ,
78- node_facet_properties : Optional [List ] = None ,
79- edge_facet_properties : Optional [List ] = None ,
80- progress_monitor : Optional [Callable [[GraphEntityType , List ], None ]] = None ,
81- error_log = None ,
82- ** kwargs ,
76+ self ,
77+ name = "" ,
78+ node_facet_properties : Optional [List ] = None ,
79+ edge_facet_properties : Optional [List ] = None ,
80+ progress_monitor : Optional [Callable [[GraphEntityType , List ], None ]] = None ,
81+ error_log = None ,
82+ ** kwargs ,
8383 ):
8484 """
8585 MetaKnowledgeGraph constructor.
@@ -213,8 +213,8 @@ def __init__(self, category_curie: str, mkg):
213213 Biolink Model category CURIE identifier.
214214 """
215215 if not (
216- _category_curie_regexp .fullmatch (category_curie )
217- or category_curie == "unknown"
216+ _category_curie_regexp .fullmatch (category_curie )
217+ or category_curie == "unknown"
218218 ):
219219 raise RuntimeError ("Invalid Biolink category CURIE: " + category_curie )
220220
@@ -280,7 +280,7 @@ def get_count(self) -> int:
280280 return self .category_stats ["count" ]
281281
282282 def get_count_by_source (
283- self , facet : str = "provided_by" , source : str = None
283+ self , facet : str = "provided_by" , source : str = None
284284 ) -> Dict [str , Any ]:
285285 """
286286 Parameters
@@ -469,8 +469,24 @@ def _compile_triple_source_stats(self, triple: Tuple[str, str, str], data: Dict)
469469 data ,
470470 )
471471
@staticmethod
def _normalize_relation_field(field) -> Set:
    """
    Normalize the value of a KGX edge 'relation' field to a set of terms.

    Parameters
    ----------
    field
        Either a single term given as a string, or a non-string
        collection of terms (list, tuple, set or frozenset),
        possibly containing duplicates.

    Returns
    -------
    Set
        A new set holding the distinct terms found in ``field``.

    Raises
    ------
    TypeError
        If ``field`` is neither a string nor one of the supported collection types.

    """
    # A string is itself iterable, so it must be tested before any
    # collection handling: it is coerced to a set of one element
    # rather than being split into its characters.
    if isinstance(field, str):
        return {field}

    # Non-string collections: duplicates are eliminated
    # by normalizing to a (new) set.
    if isinstance(field, (list, tuple, set, frozenset)):
        return set(field)

    raise TypeError(f"Unexpected KGX edge 'relation' data field of type '{type(field)}'")
487+
472488 def _process_triple (
473- self , subject_category : str , predicate : str , object_category : str , data : Dict
489+ self , subject_category : str , predicate : str , object_category : str , data : Dict
474490 ):
475491 # Process the 'valid' S-P-O triple here...
476492 triple = (subject_category , predicate , object_category )
@@ -484,11 +500,13 @@ def _process_triple(
484500 "count" : 0 ,
485501 }
486502
487- if (
488- "relation" in data
489- and data ["relation" ] not in self .association_map [triple ]["relations" ]
490- ):
491- self .association_map [triple ]["relations" ].add (data ["relation" ])
503+ # patch for observed defect in some ETL's such as the July 2021 SRI Reference graph
504+ # in which the relation field ends up being a list of terms, sometimes duplicated
505+
506+ if "relation" in data :
507+ # input data["relation"] is normalized to a Set here
508+ data ["relation" ] = self ._normalize_relation_field (data ["relation" ])
509+ self .association_map [triple ]["relations" ].update (data ["relation" ])
492510
493511 self .association_map [triple ]["count" ] += 1
494512
@@ -545,7 +563,6 @@ def analyse_edge(self, u, v, k, data) -> None:
545563 return
546564
547565 for obj_cat_idx in self .node_catalog [v ]:
548-
549566 object_category : str = self .Category .get_category_curie_from_index (
550567 obj_cat_idx
551568 )
@@ -733,12 +750,12 @@ def get_total_edge_counts_across_mappings(self) -> int:
733750 return count
734751
735752 def get_edge_count_by_source (
736- self ,
737- subject_category : str ,
738- predicate : str ,
739- object_category : str ,
740- facet : str = "knowledge_source" ,
741- source : Optional [str ] = None ,
753+ self ,
754+ subject_category : str ,
755+ predicate : str ,
756+ object_category : str ,
757+ facet : str = "knowledge_source" ,
758+ source : Optional [str ] = None ,
742759 ) -> Dict [str , Any ]:
743760 """
744761 Returns count by source for one S-P-O triple (S, O being Biolink categories; P, a Biolink predicate)
@@ -751,8 +768,8 @@ def get_edge_count_by_source(
751768 return dict ()
752769 triple = (subject_category , predicate , object_category )
753770 if (
754- triple in self .association_map
755- and "count_by_source" in self .association_map [triple ]
771+ triple in self .association_map
772+ and "count_by_source" in self .association_map [triple ]
756773 ):
757774 if facet in self .association_map [triple ]["count_by_source" ]:
758775 if source :
@@ -902,10 +919,10 @@ def save(self, file, name: str = None, file_format: str = "json") -> None:
902919 yaml .dump (stats , file )
903920
904921
905- def generate_meta_knowledge_graph (graph : BaseGraph , name : str , filename : str ) -> None :
922+ def generate_meta_knowledge_graph (graph : BaseGraph , name : str , filename : str , ** kwargs ) -> None :
906923 """
907- Generate a knowledge map that describes the composition of the graph
908- and write to ``filename``.
924+ Generate a knowledge map that describes
925+ the composition of the graph and write to ``filename``.
909926
910927 Parameters
911928 ----------
@@ -917,7 +934,7 @@ def generate_meta_knowledge_graph(graph: BaseGraph, name: str, filename: str) ->
917934 The file to write the knowledge map to
918935
919936 """
920- graph_stats = summarize_graph (graph , name )
937+ graph_stats = summarize_graph (graph , name , ** kwargs )
921938 with open (filename , mode = "w" ) as mkgh :
922939 dump (graph_stats , mkgh , indent = 4 , default = mkg_default )
923940
@@ -940,5 +957,5 @@ def summarize_graph(graph: BaseGraph, name: str = None, **kwargs) -> Dict:
940957 Dict
941958 A TRAPI 1.1 compliant meta knowledge graph of the knowledge graph returned as a dictionary.
942959 """
943- mkg = MetaKnowledgeGraph (name )
960+ mkg = MetaKnowledgeGraph (name , ** kwargs )
944961 return mkg .summarize_graph (graph )
0 commit comments