get_provenance.py

import pandas as pd
import requests
import streamlit as st
import time
from datetime import datetime

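# Node records share the layout
#   [title, publicationDate (datetime), year, journal, authors, url],
# with the seed paper additionally carrying its embedding vector.
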
def get_provenance(seed_paper):
    """
    Given a seed paper URL, build its provenance graph: return `nodes`, a dict
    mapping each paperId to its metadata, and `edges`, a DataFrame of
    (referencing, referenced) paperId pairs covering the seed paper's parents
    and grandparents.
    """
    progress_bar = st.progress(0)
    nodes = dict()
    # Get metadata and references of the seed paper (embedding included)
    seed_fields = ("title,year,publicationDate,journal,authors,url,"
                   "references.title,references.publicationDate,references.year,"
                   "references.journal,references.authors,references.url,embedding")
    request_url = f"https://api.semanticscholar.org/graph/v1/paper/URL:{seed_paper}?fields={seed_fields}"
    http = requests.get(request_url)
    if http.status_code == 429:
        # Rate limited: back off and retry once after five minutes
        print("Waiting 5 minutes for access to the API...")
        time.sleep(300)
        http = requests.get(request_url)
    seed_json = http.json()
    # Put the seed paper's metadata (with its embedding vector) into the nodes dict
    nodes[seed_json['paperId']] = [seed_json['title'],
                                   datetime.strptime(seed_json['publicationDate'], '%Y-%m-%d'),
                                   seed_json['year'], seed_json['journal'],
                                   seed_json['authors'], seed_json['url'],
                                   seed_json['embedding']['vector']]
    # Put corpus-listed references into the nodes dict (rows with missing
    # fields are dropped; the index is reset so it can drive the progress bar)
    references_df = pd.DataFrame(seed_json['references']).dropna().reset_index(drop=True)
    for index, row in references_df.iterrows():
        nodes[row['paperId']] = [row['title'],
                                 datetime.strptime(row['publicationDate'], '%Y-%m-%d'),
                                 row['year'], row['journal'], row['authors'], row['url']]
    # Edge list: seed paper -> each corpus-listed reference
    edges = pd.DataFrame({"referencing": [seed_json['paperId']] * len(references_df),
                          "referenced": references_df['paperId']})
    progress_bar.progress(1 / len(references_df.index))
    # Fields for the second-level requests (same as above, minus the embedding)
    ref_fields = ("title,year,publicationDate,journal,authors,url,"
                  "references.title,references.publicationDate,references.year,"
                  "references.journal,references.authors,references.url")
    # For each reference, get its references and their metadata, and add them
    # to the nodes dict and the edge list
    for index, row in references_df.iterrows():
        progress_bar.progress((index + 1) / len(references_df.index))
        # Get metadata and references of the referenced paper
        ref_url = f"https://api.semanticscholar.org/graph/v1/paper/{row['paperId']}?fields={ref_fields}"
        temp_http = requests.get(ref_url)
        if temp_http.status_code == 429:
            print("Waiting 5 minutes for access to the Semantic Scholar API...")
            time.sleep(300)
            temp_http = requests.get(ref_url)
        if temp_http.status_code == 404:
            continue
        temp_json = temp_http.json()
        # The referenced paper itself is already in the nodes dict; only its
        # corpus-listed references need to be added
        temp_references_df = pd.DataFrame(temp_json['references']).dropna()
        if len(temp_references_df) == 0:
            continue
        for i, r in temp_references_df.iterrows():
            nodes[r['paperId']] = [r['title'],
                                   datetime.strptime(r['publicationDate'], '%Y-%m-%d'),
                                   r['year'], r['journal'], r['authors'], r['url']]
        # Edge list for this reference's own references, appended to the main edge list
        temp_edges = pd.DataFrame({"referencing": [temp_json['paperId']] * len(temp_references_df),
                                   "referenced": temp_references_df['paperId']})
        edges = pd.concat([edges, temp_edges])
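    # Note: the loop above issues one API request per reference, so large
    # reference lists take time and may repeatedly trigger the rate-limit wait.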
    # Reset the row index (pd.concat kept each batch's original indices)
    edges = edges.reset_index(drop=True)
    # Column: total number of times each paper is referenced within the graph
    edges['total_refs'] = edges.groupby(['referenced'])['referencing'].transform('count')
    # Column: whether the referencing paper is the seed paper itself
    edges['direct_ref'] = edges['referencing'] == seed_json['paperId']
    return nodes, edges
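

# Minimal usage sketch, not part of the original file. Assumptions: the module
# is executed with `streamlit run` (get_provenance draws a st.progress bar),
# and the seed URL below is an illustrative placeholder -- the API's URL:
# lookup accepts semanticscholar.org and arxiv.org links, among others.
if __name__ == "__main__":
    seed_url = "https://arxiv.org/abs/1706.03762"  # illustrative placeholder
    nodes, edges = get_provenance(seed_url)
    print(f"Collected {len(nodes)} nodes and {len(edges)} edges.")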