-
Notifications
You must be signed in to change notification settings - Fork 6
Description
Summary
Provenance graphs for BRs contain wrong information, or no information at all, about the primary source of the BR. As for the wrong primary source, OpenAlex is recorded as the source of the BR (this is impossible, since only IDs have been ingested from OpenAlex).
Incorrect or missing values for prov:hadPrimarySource property in provenance of Bibliographic Resources
In v.7 of the OpenCitations Meta RDF dataset (https://doi.org/10.6084/m9.figshare.21747536.v7), the provenance data for Bibliographic Resources (BR) stores wrong information, or no information at all, regarding the primary source. More specifically:
- 85,962,741 provenance snapshots for BRs appear to have OpenAlex as the BR’s primary source, which is not possible since only Identifier entities have been ingested into Meta from OpenAlex.
- the only other primary source for BRs appears to be Crossref (https://api.crossref.org/snapshots/monthly/2024/03/all.json.tar.gz is mentioned as primary source in 3,239,679 snapshots) and no other data source (e.g. DataCite) is present.
- 58,537,740 snapshots have no information about the primary source, and 28,678,025 BRs have no primary source mentioned in any of their associated provenance snapshots.
Consider for example the provenance graph for https://w3id.org/oc/meta/br/061903839851, which shows OpenAlex as primary source:
{
"@graph":[
{
"@id":"https://w3id.org/oc/meta/br/061903839851/prov/se/1",
"@type":[
"http://www.w3.org/ns/prov#Entity"
],
"http://purl.org/dc/terms/description":[
{
"@value":"The entity 'https://w3id.org/oc/meta/br/061903839851' has been created."
}
],
"http://www.w3.org/ns/prov#generatedAtTime":[
{
"@type":"http://www.w3.org/2001/XMLSchema#dateTime",
"@value":"2023-12-13T15:02:38.178014"
},
{
"@type":"http://www.w3.org/2001/XMLSchema#dateTime",
"@value":"2024-03-28T08:21:51+00:00"
}
],
"http://www.w3.org/ns/prov#hadPrimarySource":[
{
"@id":"https://openalex.s3.amazonaws.com/browse.html"
}
],
"http://www.w3.org/ns/prov#specializationOf":[
{
"@id":"https://w3id.org/oc/meta/br/061903839851"
}
],
"http://www.w3.org/ns/prov#wasAttributedTo":[
{
"@id":"https://w3id.org/oc/meta/prov/pa/1"
}
]
}
],
"@id":"https://w3id.org/oc/meta/br/061903839851/prov/"
}
Or the provenance graph for https://w3id.org/oc/meta/br/0680731459, lacking any primary source:
{
"@graph":[
{
"@id":"https://w3id.org/oc/meta/br/0680731459/prov/se/1",
"@type":[
"http://www.w3.org/ns/prov#Entity"
],
"http://purl.org/dc/terms/description":[
{
"@value":"The entity 'https://w3id.org/oc/meta/br/0680731459' has been created."
}
],
"http://www.w3.org/ns/prov#generatedAtTime":[
{
"@type":"http://www.w3.org/2001/XMLSchema#dateTime",
"@value":"2023-12-13T14:54:00.805110"
}
],
"http://www.w3.org/ns/prov#specializationOf":[
{
"@id":"https://w3id.org/oc/meta/br/0680731459"
}
],
"http://www.w3.org/ns/prov#wasAttributedTo":[
{
"@id":"https://w3id.org/oc/meta/prov/pa/1"
}
]
}
],
"@id":"https://w3id.org/oc/meta/br/0680731459/prov/"
}
Reproducing the results of the analysis
The following script can be used to get, from the OC Meta provenance RDF files (working with the decompressed RDF dump), the number of BR provenance snapshots citing each primary source, the number of provenance snapshots missing information about the primary source, the number of BRs without any associated primary source and the total number of BR provenance graphs.
from collections import defaultdict
from tqdm import tqdm
import os
from zipfile import ZipFile
import json
def get_br_primsource_count(data_dir, show_progress=True):
    """
    Summarise the primary sources recorded in OC Meta BR provenance RDF files.

    Walks ``data_dir``, opens every file found in a directory named 'prov' as a
    ZIP archive, loads its 'se.json' member and inspects each snapshot of each
    provenance graph for the prov:hadPrimarySource property. The summary is
    also printed to standard output.

    :param data_dir: Path to the directory containing the decompressed provenance archive.
    :type data_dir: str
    :param show_progress: If True (default), wrap the file loop in a tqdm progress bar.
    :type show_progress: bool
    :return: A tuple containing:
        - A dictionary mapping each primary source IRI to the number of snapshots citing it.
        - The number of snapshots missing primary source information.
        - The number of BRs without a primary source in any of their snapshots.
        - The total number of BR provenance graphs.
    :rtype: tuple(dict, int, int, int)
    """
    prim_source_key = 'http://www.w3.org/ns/prov#hadPrimarySource'

    source_count = defaultdict(int)
    no_primsource_snapshots_count = 0
    no_primsource_br_count = 0
    total_br_count = 0

    # os.walk never yields the same path twice, so a sorted list is enough
    # and makes the processing order reproducible.
    fpaths = sorted(
        os.path.join(dirpath, fn)
        for dirpath, _, filenames in os.walk(data_dir)
        if os.path.basename(dirpath) == 'prov'
        for fn in filenames
    )

    for fp in (tqdm(fpaths) if show_progress else fpaths):
        with ZipFile(fp) as archive, archive.open('se.json') as f:
            data = json.load(f)

        for br_prov_g in data:
            total_br_count += 1
            br_has_source = False
            for snapshot_g in br_prov_g['@graph']:
                sources = snapshot_g.get(prim_source_key)
                if sources:
                    # At least one snapshot of this BR names a primary source.
                    br_has_source = True
                    for primary_source_g in sources:
                        source_count[primary_source_g['@id']] += 1
                else:
                    no_primsource_snapshots_count += 1
            if not br_has_source:
                no_primsource_br_count += 1

    print(f'Primary sources distribution: {dict(source_count)}\n\n',
          f'Snapshots without primary source: {no_primsource_snapshots_count}\n\n',
          f'BRs without primary source: {no_primsource_br_count}\n\n',
          f'Total BR count: {total_br_count}')
    return dict(source_count), no_primsource_snapshots_count, no_primsource_br_count, total_br_count
path_to_decompressed_data_root = 'E:/br_test/br'
print(get_br_primsource_count(path_to_decompressed_data_root))
Which outputs the following results with Meta RDF dump v.7:
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 116716/116716 [34:54<00:00, 55.73it/s]
Primary sources distribution: {'https://openalex.s3.amazonaws.com/browse.html': 85962741, 'https://api.crossref.org/snapshots/monthly/2024/03/all.json.tar.gz': 3239679}
Snapshots without primary source: 58537740
BRs without primary source: 28678025
Total BR count: 116661731
To get the JSON-LD-serialised provenance graphs of specific BRs, the following script can be used.
import os
from zipfile import ZipFile
import json
def _first_bucket_holding(parent_dir, number):
    """Return the smallest all-digit entry name in parent_dir whose value is >= number, or None."""
    numeric_names = sorted((name for name in os.listdir(parent_dir) if name.isdigit()), key=int)
    return next((name for name in numeric_names if number <= int(name)), None)


def get_provenance_graph(entity_iri: str, data_root: str) -> "dict | None":
    """
    Retrieve the provenance graph of an entity from the decompressed provenance data.

    The last path segment of the IRI is split into a supplier prefix (everything
    up to and including the first '0' found after the first character) and a
    sequential number. The file is then looked up as
    ``<data_root>/<prefix>/<dir>/<subdir>/prov/se.zip``, where ``<dir>`` and
    ``<subdir>`` are the smallest numerically named directories whose name is
    greater than or equal to the sequential number (directory names are treated
    as inclusive upper bounds).

    :param entity_iri: The IRI of the entity (i.e. its OMID) whose provenance graph is to be retrieved.
    :param data_root: The path to the root directory storing the provenance data,
        i.e. the folder resulting from decompression of a .tar.gz file.
    :return: The provenance graph of the entity as a dictionary, or None if the
        expected directory, archive or graph cannot be found.
    :raises ValueError: If the part of the identifier after the supplier prefix is not an integer.
    """
    digits = entity_iri.split('/')[-1]
    supplier_prefix = digits[:digits.find('0', 1) + 1]
    sequential_number = int(digits.removeprefix(supplier_prefix))

    prefix_dir = os.path.join(data_root, supplier_prefix)
    # An empty prefix would make prefix_dir resolve to data_root itself.
    if not supplier_prefix or not os.path.isdir(prefix_dir):
        return None

    range_dir = _first_bucket_holding(prefix_dir, sequential_number)
    if range_dir is None:
        return None

    # This level also holds non-directory siblings such as '<n>.zip', which the
    # all-digit filter in the helper leaves out.
    file_dir = _first_bucket_holding(os.path.join(prefix_dir, range_dir), sequential_number)
    if file_dir is None:
        return None

    archive_path = os.path.join(prefix_dir, range_dir, file_dir, 'prov', 'se.zip')
    if not os.path.isfile(archive_path):
        return None

    with ZipFile(archive_path) as archive, archive.open('se.json') as f:
        data = json.load(f)

    target_id = entity_iri + '/prov/'
    return next((obj for obj in data if obj['@id'] == target_id), None)
# IRIs of the two example BRs discussed above (plain IRIs, without the
# '/prov/' suffix, which get_provenance_graph appends itself).
example_brs = [
    'https://w3id.org/oc/meta/br/061903839851',
    'https://w3id.org/oc/meta/br/0680731459',
]
path_to_decompressed_data_root = 'E:/br_test/br'
# get_provenance_graph expects a single IRI string, so look up each BR in turn.
for example_br in example_brs:
    print(get_provenance_graph(example_br, path_to_decompressed_data_root))