Skip to content

Commit 420699a

Browse files
committed
Merge branch 'master' of https://github.com/biolink/kgx
2 parents f15f944 + c0f8411 commit 420699a

File tree

5 files changed

+142
-47
lines changed

5 files changed

+142
-47
lines changed

kgx/graph_operations/meta_knowledge_graph.py

Lines changed: 47 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -73,13 +73,13 @@ class MetaKnowledgeGraph:
7373
error_log = stderr
7474

7575
def __init__(
76-
self,
77-
name="",
78-
node_facet_properties: Optional[List] = None,
79-
edge_facet_properties: Optional[List] = None,
80-
progress_monitor: Optional[Callable[[GraphEntityType, List], None]] = None,
81-
error_log=None,
82-
**kwargs,
76+
self,
77+
name="",
78+
node_facet_properties: Optional[List] = None,
79+
edge_facet_properties: Optional[List] = None,
80+
progress_monitor: Optional[Callable[[GraphEntityType, List], None]] = None,
81+
error_log=None,
82+
**kwargs,
8383
):
8484
"""
8585
MetaKnowledgeGraph constructor.
@@ -213,8 +213,8 @@ def __init__(self, category_curie: str, mkg):
213213
Biolink Model category CURIE identifier.
214214
"""
215215
if not (
216-
_category_curie_regexp.fullmatch(category_curie)
217-
or category_curie == "unknown"
216+
_category_curie_regexp.fullmatch(category_curie)
217+
or category_curie == "unknown"
218218
):
219219
raise RuntimeError("Invalid Biolink category CURIE: " + category_curie)
220220

@@ -280,7 +280,7 @@ def get_count(self) -> int:
280280
return self.category_stats["count"]
281281

282282
def get_count_by_source(
283-
self, facet: str = "provided_by", source: str = None
283+
self, facet: str = "provided_by", source: str = None
284284
) -> Dict[str, Any]:
285285
"""
286286
Parameters
@@ -469,8 +469,24 @@ def _compile_triple_source_stats(self, triple: Tuple[str, str, str], data: Dict)
469469
data,
470470
)
471471

472+
@staticmethod
def _normalize_relation_field(field) -> Set:
    """
    Normalize the value of a KGX edge 'relation' field to a set of terms.

    Parameters
    ----------
    field
        Either a single relation term given as a string, or a list,
        tuple, set or frozenset of relation terms.

    Returns
    -------
    Set
        A new set holding the distinct relation terms.

    Raises
    ------
    TypeError
        If ``field`` is neither a string nor one of the supported collections.

    """
    # A string is itself iterable, so it is handled first: handing it
    # to set() would split the term into its individual characters.
    if isinstance(field, str):
        # for uniformity, coerce to a set of one element
        return {field}
    # Builtin types are tested directly (one call, one tuple) rather
    # than through the deprecated typing.List/Tuple/Set aliases.
    if isinstance(field, (list, tuple, set, frozenset)):
        # set() eliminates duplicated terms
        return set(field)
    raise TypeError(f"Unexpected KGX edge 'relation' data field of type '{type(field)}'")
487+
472488
def _process_triple(
473-
self, subject_category: str, predicate: str, object_category: str, data: Dict
489+
self, subject_category: str, predicate: str, object_category: str, data: Dict
474490
):
475491
# Process the 'valid' S-P-O triple here...
476492
triple = (subject_category, predicate, object_category)
@@ -484,11 +500,13 @@ def _process_triple(
484500
"count": 0,
485501
}
486502

487-
if (
488-
"relation" in data
489-
and data["relation"] not in self.association_map[triple]["relations"]
490-
):
491-
self.association_map[triple]["relations"].add(data["relation"])
503+
# patch for an observed defect in some ETLs, such as the July 2021 SRI Reference graph,
504+
# in which the relation field ends up being a list of terms, sometimes duplicated
505+
506+
if "relation" in data:
507+
# input data["relation"] is normalized to a Set here
508+
data["relation"] = self._normalize_relation_field(data["relation"])
509+
self.association_map[triple]["relations"].update(data["relation"])
492510

493511
self.association_map[triple]["count"] += 1
494512

@@ -545,7 +563,6 @@ def analyse_edge(self, u, v, k, data) -> None:
545563
return
546564

547565
for obj_cat_idx in self.node_catalog[v]:
548-
549566
object_category: str = self.Category.get_category_curie_from_index(
550567
obj_cat_idx
551568
)
@@ -733,12 +750,12 @@ def get_total_edge_counts_across_mappings(self) -> int:
733750
return count
734751

735752
def get_edge_count_by_source(
736-
self,
737-
subject_category: str,
738-
predicate: str,
739-
object_category: str,
740-
facet: str = "knowledge_source",
741-
source: Optional[str] = None,
753+
self,
754+
subject_category: str,
755+
predicate: str,
756+
object_category: str,
757+
facet: str = "knowledge_source",
758+
source: Optional[str] = None,
742759
) -> Dict[str, Any]:
743760
"""
744761
Returns count by source for one S-P-O triple (S, O being Biolink categories; P, a Biolink predicate)
@@ -751,8 +768,8 @@ def get_edge_count_by_source(
751768
return dict()
752769
triple = (subject_category, predicate, object_category)
753770
if (
754-
triple in self.association_map
755-
and "count_by_source" in self.association_map[triple]
771+
triple in self.association_map
772+
and "count_by_source" in self.association_map[triple]
756773
):
757774
if facet in self.association_map[triple]["count_by_source"]:
758775
if source:
@@ -902,10 +919,10 @@ def save(self, file, name: str = None, file_format: str = "json") -> None:
902919
yaml.dump(stats, file)
903920

904921

905-
def generate_meta_knowledge_graph(graph: BaseGraph, name: str, filename: str) -> None:
922+
def generate_meta_knowledge_graph(graph: BaseGraph, name: str, filename: str, **kwargs) -> None:
906923
"""
907-
Generate a knowledge map that describes the composition of the graph
908-
and write to ``filename``.
924+
Generate a knowledge map that describes
925+
the composition of the graph and write to ``filename``.
909926
910927
Parameters
911928
----------
@@ -917,7 +934,7 @@ def generate_meta_knowledge_graph(graph: BaseGraph, name: str, filename: str) ->
917934
The file to write the knowledge map to
918935
919936
"""
920-
graph_stats = summarize_graph(graph, name)
937+
graph_stats = summarize_graph(graph, name, **kwargs)
921938
with open(filename, mode="w") as mkgh:
922939
dump(graph_stats, mkgh, indent=4, default=mkg_default)
923940

@@ -940,5 +957,5 @@ def summarize_graph(graph: BaseGraph, name: str = None, **kwargs) -> Dict:
940957
Dict
941958
A TRAPI 1.1 compliant meta knowledge graph of the knowledge graph returned as a dictionary.
942959
"""
943-
mkg = MetaKnowledgeGraph(name)
960+
mkg = MetaKnowledgeGraph(name, **kwargs)
944961
return mkg.summarize_graph(graph)
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
subject object predicate relation aggregator_knowledge_source
2+
HGNC:10848 NCBIGene:6469 biolink:interacts_with RO:0002434 biogrid
3+
HGNC:10848 HGNC:9398 biolink:interacts_with RO:0002434|RO:0002434|RO:0002434 string
4+
HGNC:10848 HGNC:9399 biolink:interacts_with RO:0002434 string
5+
HGNC:10848 HGNC:16265 biolink:interacts_with RO:0002434 string
6+
HGNC:10848 HGNC:16787 biolink:interacts_with RO:0002434 biogrid
7+
HGNC:10848 GO:0009986 biolink:part_of BFO:0000050 go
8+
HGNC:10848 GO:0097190 biolink:related_to RO:0002331|RO:0002327 go
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
id name category taxon
2+
HGNC:10848 SHH (human) biolink:Gene NCBITaxon:9606
3+
NCBIGene:6469 SHH biolink:Gene NCBITaxon:9606
4+
HGNC:9398 OLIG2 biolink:Gene NCBITaxon:9606
5+
HGNC:9399 PRKCD biolink:Gene NCBITaxon:9606
6+
HGNC:16265 WNT5B biolink:Gene NCBITaxon:9606
7+
HGNC:16466 SUFU biolink:Gene NCBITaxon:9606
8+
HGNC:16787 EDEM3 biolink:Gene NCBITaxon:9606
9+
GO:0009986 cell surface biolink:CellularComponent
10+
GO:0097190 apoptotic signaling pathway biolink:BiologicalProcess

tests/unit/test_cli_utils.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,15 @@ def test_graph_summary1():
4444
os.path.join(RESOURCE_DIR, "graph_edges.tsv"),
4545
]
4646
output = os.path.join(TARGET_DIR, "graph_stats1.yaml")
47-
summary_stats = graph_summary(inputs, "tsv", None, output, report_type="kgx-map")
47+
summary_stats = graph_summary(
48+
inputs,
49+
"tsv",
50+
None,
51+
output,
52+
node_facet_properties=["provided_by"],
53+
edge_facet_properties=["aggregator_knowledge_source"],
54+
report_type="kgx-map"
55+
)
4856

4957
assert os.path.exists(output)
5058
assert summary_stats
@@ -74,6 +82,8 @@ def test_graph_summary2a():
7482
None,
7583
output,
7684
report_type="meta-knowledge-graph",
85+
node_facet_properties=["provided_by"],
86+
edge_facet_properties=["aggregator_knowledge_source"],
7787
graph_name="Default Meta-Knowledge-Graph",
7888
)
7989

@@ -101,6 +111,8 @@ def test_graph_summary2b():
101111
None,
102112
output,
103113
report_type="meta-knowledge-graph",
114+
node_facet_properties=["provided_by"],
115+
edge_facet_properties=["aggregator_knowledge_source"],
104116
report_format="yaml",
105117
)
106118

@@ -126,6 +138,8 @@ def test_graph_summary2c():
126138
input_compression=None,
127139
output=output,
128140
report_type="meta-knowledge-graph",
141+
node_facet_properties=["provided_by"],
142+
edge_facet_properties=["aggregator_knowledge_source"],
129143
stream=True,
130144
)
131145

tests/unit/test_meta_knowledge_graph.py

Lines changed: 62 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,24 @@
1313
from tests import RESOURCE_DIR, TARGET_DIR
1414

1515

16+
def _check_mkg_json_contents(data):
    """
    Assert the expected node and edge statistics of the test
    meta knowledge graph loaded from its JSON serialization.

    Parameters
    ----------
    data
        The deserialized meta knowledge graph, with "nodes" and "edges" entries.

    """
    nodes = data["nodes"]

    # Each category must report the given identifier prefix
    expected_prefixes = (
        ("biolink:Gene", "NCBIGene"),
        ("biolink:Pathway", "REACT"),
        ("biolink:PhenotypicFeature", "HP"),
    )
    for category, prefix in expected_prefixes:
        assert prefix in nodes[category]["id_prefixes"]

    assert nodes["biolink:Gene"]["count"] == 178
    assert len(nodes) == 8

    edges = data["edges"]
    assert len(edges) == 13

    # The first edge entry is checked field by field
    first_edge = edges[0]
    expected_fields = (
        ("subject", "biolink:Gene"),
        ("predicate", "biolink:interacts_with"),
        ("object", "biolink:Gene"),
        ("count", 165),
    )
    for key, expected in expected_fields:
        assert first_edge[key] == expected

    counts_by_source = first_edge["count_by_source"]
    assert "aggregator_knowledge_source" in counts_by_source
    assert counts_by_source["aggregator_knowledge_source"]["string"] == 159
32+
33+
1634
def test_generate_classical_meta_knowledge_graph():
1735
"""
1836
Test generate meta knowledge graph operation.
@@ -32,17 +50,15 @@ def test_generate_classical_meta_knowledge_graph():
3250
output_filename = os.path.join(TARGET_DIR, "test_meta_knowledge_graph-1.json")
3351

3452
generate_meta_knowledge_graph(
35-
transformer.store.graph, "Test Graph", output_filename
53+
graph=transformer.store.graph,
54+
name="Test Graph",
55+
filename=output_filename,
56+
edge_facet_properties=["aggregator_knowledge_source"]
3657
)
3758

3859
data = json.load(open(output_filename))
3960
assert data["name"] == "Test Graph"
40-
assert "NCBIGene" in data["nodes"]["biolink:Gene"]["id_prefixes"]
41-
assert "REACT" in data["nodes"]["biolink:Pathway"]["id_prefixes"]
42-
assert "HP" in data["nodes"]["biolink:PhenotypicFeature"]["id_prefixes"]
43-
assert data["nodes"]["biolink:Gene"]["count"] == 178
44-
assert len(data["nodes"]) == 8
45-
assert len(data["edges"]) == 13
61+
_check_mkg_json_contents(data)
4662

4763

4864
def test_generate_meta_knowledge_graph_by_stream_inspector():
@@ -61,7 +77,8 @@ def test_generate_meta_knowledge_graph_by_stream_inspector():
6177
transformer = Transformer(stream=True)
6278

6379
mkg = MetaKnowledgeGraph(
64-
"Test Graph - Streamed", edge_facet_properties=["aggregator_knowledge_source"]
80+
"Test Graph - Streamed",
81+
edge_facet_properties=["aggregator_knowledge_source"]
6582
)
6683

6784
# We configure the Transformer with a data flow inspector
@@ -97,6 +114,7 @@ def test_generate_meta_knowledge_graph_by_stream_inspector():
97114
assert len(ecbs1) == 2
98115
assert "biogrid" in ecbs1
99116
assert "string" in ecbs1
117+
assert ecbs1["string"] == 159
100118

101119
ecbs2 = mkg.get_edge_count_by_source(
102120
"biolink:Gene",
@@ -108,6 +126,7 @@ def test_generate_meta_knowledge_graph_by_stream_inspector():
108126
assert "omim" in ecbs2
109127
assert "orphanet" in ecbs2
110128
assert "hpoa" in ecbs2
129+
assert ecbs2["hpoa"] == 111
111130

112131

113132
#
@@ -155,7 +174,10 @@ def summary(self):
155174
monitor = ProgressMonitor()
156175

157176
mkg = MetaKnowledgeGraph(
158-
name="Test Graph - Streamed, Stats accessed via File", progress_monitor=monitor
177+
name="Test Graph - Streamed, Stats accessed via File",
178+
progress_monitor=monitor,
179+
node_facet_properties=["provided_by"],
180+
edge_facet_properties=["aggregator_knowledge_source"]
159181
)
160182

161183
t.transform(input_args=input_args, inspector=mkg)
@@ -166,13 +188,7 @@ def summary(self):
166188

167189
data = json.load(open(output_filename))
168190
assert data["name"] == "Test Graph - Streamed, Stats accessed via File"
169-
assert "NCBIGene" in data["nodes"]["biolink:Gene"]["id_prefixes"]
170-
assert "REACT" in data["nodes"]["biolink:Pathway"]["id_prefixes"]
171-
assert "HP" in data["nodes"]["biolink:PhenotypicFeature"]["id_prefixes"]
172-
assert data["nodes"]["biolink:Gene"]["count"] == 178
173-
assert len(data["nodes"]) == 8
174-
assert len(data["edges"]) == 13
175-
191+
_check_mkg_json_contents(data)
176192
monitor.summary()
177193

178194

@@ -225,3 +241,33 @@ def test_meta_knowledge_graph_multiple_category_and_predicate_parsing():
225241
assert mkg.get_edge_mapping_count() == 25
226242

227243
assert mkg.get_total_edge_counts_across_mappings() == 100
244+
245+
246+
def test_meta_knowledge_graph_of_complex_graph_data():
    """
    Test meta knowledge graph generation over
    the 'complex_graph' TSV test data files.
    """
    input_args = {
        "filename": [
            os.path.join(RESOURCE_DIR, "complex_graph_nodes.tsv"),
            os.path.join(RESOURCE_DIR, "complex_graph_edges.tsv"),
        ],
        "format": "tsv",
    }

    transformer = Transformer()
    transformer.transform(input_args)

    # Written to a file of its own: "test_meta_knowledge_graph-1.json" is
    # already the output of test_generate_classical_meta_knowledge_graph,
    # so sharing that name would let the two tests overwrite each other.
    output_filename = os.path.join(
        TARGET_DIR, "test_meta_knowledge_graph-complex.json"
    )

    generate_meta_knowledge_graph(
        graph=transformer.store.graph,
        name="Complex Test Graph",
        filename=output_filename,
        edge_facet_properties=["aggregator_knowledge_source"],
    )

    # The context manager closes the file handle once the JSON is loaded
    with open(output_filename) as mkg_file:
        data = json.load(mkg_file)

    assert data["name"] == "Complex Test Graph"
    # Dump the generated graph for manual inspection of the test output
    print(f"\n{json.dumps(data, indent=4)}")

0 commit comments

Comments
 (0)