Skip to content

Commit 81fa979

Browse files
committed
migrating from MAG to openalex
1 parent f8eed90 commit 81fa979

9 files changed

+94
-88
lines changed

README.md

+6-7
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,22 @@
11
# citations
2-
Citation network analysis using [Microsoft Academic](https://www.microsoft.com/en-us/research/project/academic-knowledge/) and [3d-force-graph](https://github.com/vasturiano/3d-force-graph)
2+
Citation network analysis using [OpenAlex](https://openalex.org/) and [3d-force-graph](https://github.com/vasturiano/3d-force-graph)
33

44
### Installation
55

66
`sudo pip3 install -r requirements.txt`
77

88
### Fetching data
99

10-
`./fetch_author.py AUTHOR_ID`
10+
`./fetch_author.py AUTHOR_NAME`
1111
`./fetch_paper.py PAPER_ID`
1212

1313
### Sample datasets
1414

1515
#### Authors
16-
[Daniel J. Exeter](https://uoa-eresearch.github.io/citations/?data=Dan) 1083 nodes 2153 links
17-
[David O'Sullivan](https://uoa-eresearch.github.io/citations/?data=David) 4517 nodes 14225 links
18-
[Giovanni Coco](https://uoa-eresearch.github.io/citations/?data=Giovanni) 2063 nodes 10293 links
19-
[Mark Gahegan](https://uoa-eresearch.github.io/citations/?data=Mark) 2749 nodes 8977 links
20-
[Quinn Asena](https://uoa-eresearch.github.io/citations/?data=Quinn) 12 nodes 15 links
16+
[Daniel J. Exeter](https://uoa-eresearch.github.io/citations/?data=Dan) 881 nodes 1732 links
17+
[Giovanni Coco](https://uoa-eresearch.github.io/citations/?data=Giovanni) 5459 nodes 40975 links
18+
[Mark Gahegan](https://uoa-eresearch.github.io/citations/?data=Mark) 2847 nodes 8751 links
19+
[Quinn Asena](https://uoa-eresearch.github.io/citations/?data=Quinn) 22 nodes 24 links
2120

2221
#### Papers
2322
[Decolonizing Methodologies](https://uoa-eresearch.github.io/citations/?data=Decolonizing_Methodologies) 6036 nodes 7254 links

data/Dan.json

+1-1
Large diffs are not rendered by default.

data/David.json

-1
This file was deleted.

data/Giovanni.json

+1-1
Large diffs are not rendered by default.

data/Mark.json

+1-1
Large diffs are not rendered by default.

data/Quinn.json

+1-1
Large diffs are not rendered by default.

fetch_author.py

+18-3
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,17 @@
66
from pprint import pprint
77
import sys
88
from tqdm.auto import tqdm
9+
import time
910

10-
id = sys.argv[1]
11+
# Short dataset name -> OpenAlex author ID. The short name is also used as the
# output file stem (data/<name>.json) when the results are written.
authors = {
    "Dan": "A4352911678",
    "Mark": "A4341954224",
    "Giovanni": "A4358613644",
    "Quinn": "A2899969917",
}

# Exit with a usage message instead of a bare IndexError/KeyError traceback
# when the argument is missing or is not one of the known short names.
if len(sys.argv) < 2 or sys.argv[1] not in authors:
    sys.exit(f"Usage: {sys.argv[0]} AUTHOR_NAME (one of: {', '.join(authors)})")

author = sys.argv[1]
# NOTE(review): `id` shadows the builtin; the name is kept because code further
# down the script (outside this view) presumably still reads it.
id = authors[author]
1120

1221
params = {
1322
"per-page": 200,
@@ -22,12 +31,18 @@
2231
citing_works = []
2332

2433
# For every seed work, fetch the works that cite it and collect them.
for w in tqdm(seed_works):
    # Keep retrying the same URL until a request succeeds. The `else` clause
    # leaves the retry loop only when no exception was raised, so a failed
    # request is never followed by use of a stale or undefined `response`.
    while True:
        try:
            response = requests.get(w["cited_by_api_url"], params=params).json()
        except (requests.RequestException, ValueError) as e:
            # RequestException: network/HTTP-layer failure.
            # ValueError: the body was not valid JSON.
            print(f'Got {e} when requesting {w["cited_by_api_url"]}, retrying')
            time.sleep(1)
        else:
            break
    pprint(response["meta"])
    citing_works.extend(response["results"])
2843

2944
works = seed_works + citing_works
3045
print(len(works))
3146

32-
with open("data.json", "w") as f:
47+
with open(f"data/{author}.json", "w") as f:
3348
json.dump(works, f)

index.html

+54-73
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
integrity="sha512-yocoLferfPbcwpCMr8v/B0AB4SWpJlouBwgE0D3ZHaiP1nuu5djZclFEIj9znuqghaZ3tdCMRrreLoM8km+jIQ=="
2020
crossorigin="anonymous"></script>
2121
<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
22+
<script src="https://cdnjs.cloudflare.com/ajax/libs/lodash.js/4.17.21/lodash.min.js" referrerpolicy="no-referrer"></script>
2223
<title>Citation network explorer</title>
2324
<style>
2425
html,
@@ -105,7 +106,7 @@
105106
</head>
106107

107108
<body>
108-
<h1 id="title"><span id="subtitle"><span id="name">Giovanni</span>'s citation network</span> <span id="year"></span>
109+
<h1 id="title"><span id="subtitle"><span id="name">Dan</span>'s citation network</span> <span id="year"></span>
109110
</h1>
110111
<div id="network"></div>
111112
<select id="search" placeholder="type to search..." class="form-control autocomplete"></select>
@@ -114,15 +115,15 @@ <h1 id="title"><span id="subtitle"><span id="name">Giovanni</span>'s citation ne
114115
<div class="small">Hover over the timeline to filter the network graph by year<br>Click to hold</div>
115116
Colour by
116117
<select id="color">
117-
<option value="citation_count" selected>Citation count</option>
118+
<option value="cited_by_count" selected>Citation count</option>
118119
<option value="n_authors">Number of authors</option>
119120
<option value="n_inst">Number of institutions</option>
120121
<option value="field">Primary field</option>
121122
</select><div id="legend"></div><br>
122123
Size by
123124
<select id="size">
124125
<option value="">Constant</option>
125-
<option value="citation_count" selected>Citation count</option>
126+
<option value="cited_by_count" selected>Citation count</option>
126127
<option value="n_authors">Number of authors</option>
127128
<option value="n_inst">Number of institutions</option>
128129
<option value="n_fields">Number of fields</option>
@@ -310,10 +311,9 @@ <h5 id="modal-title" class="modal-title" id="exampleModalLongTitle"></h5>
310311
var visible_nodes = nodes.map(n => n.id)
311312
var edges = [];
312313
for (var node of nodes) {
313-
for (var rid in node.references) {
314-
rid = parseInt(rid);
314+
for (var rid of node.referenced_works) {
315315
if (visible_nodes.includes(rid)) {
316-
edges.push({ source: node.id, target: rid, count: node.references[rid] })
316+
edges.push({ source: node.id, target: rid })
317317
}
318318
}
319319
}
@@ -322,7 +322,7 @@ <h5 id="modal-title" class="modal-title" id="exampleModalLongTitle"></h5>
322322
const urlParams = new URLSearchParams(window.location.search);
323323
var data = urlParams.get('data');
324324
if (!data) {
325-
data = "data/Giovanni.json"
325+
data = "data/Dan.json"
326326
} else if (data == "Decolonizing_Methodologies") {
327327
$("#subtitle").text("Influence of Decolonizing Methodologies")
328328
data = "Decolonizing_Methodologies/data.json"
@@ -333,7 +333,7 @@ <h5 id="modal-title" class="modal-title" id="exampleModalLongTitle"></h5>
333333
function summarise_by_year(data) {
334334
var time_data = {}
335335
for (var node of data) {
336-
var d = node.date_published.slice(0, 4)
336+
var d = node.publication_year;
337337
if (!time_data[d]) time_data[d] = 0;
338338
time_data[d]++;
339339
}
@@ -343,26 +343,15 @@ <h5 id="modal-title" class="modal-title" id="exampleModalLongTitle"></h5>
343343
console.log(data);
344344
window.data = data;
345345
window.current_data = data;
346+
// Used for grouping
346347
window.paper_lookup = {}
348+
// Quick built in summary stats for colorscale/size changes
347349
for (var node of data) {
    // Index every work by id so the grouping code can resolve referenced works
    paper_lookup[node.id] = node
    node.n_authors = node.authorships.length;
    // Count distinct first-listed institutions; authorships without any
    // institution are skipped so a "" placeholder is not counted as one.
    node.n_inst = uniq(node.authorships.filter(a => a.institutions.length).map(a => a.institutions[0].display_name)).length
    node.n_fields = node.concepts.length
    // A work can come back with no concepts; leave `field` unset rather than
    // throwing on concepts[0] and aborting the whole load.
    if (node.concepts.length) node.field = node.concepts[0].display_name
}
367356

368357
var time_data = summarise_by_year(data);
@@ -393,7 +382,7 @@ <h5 id="modal-title" class="modal-title" id="exampleModalLongTitle"></h5>
393382
}
394383

395384
Plotly.newPlot('timeline', plotData, layout);
396-
window.year = 2021
385+
window.year = 2023
397386
window.freeze = false;
398387
$("#timeline").on('plotly_hover', function (event, data) {
399388
if (window.freeze) return;
@@ -410,7 +399,7 @@ <h5 id="modal-title" class="modal-title" id="exampleModalLongTitle"></h5>
410399
if (window.year == d.x) return;
411400
window.year = d.x;
412401
console.log(year)
413-
var filtered_data = window.current_data.filter(n => n.date_published.slice(0, 4) <= year)
402+
var filtered_data = window.current_data.filter(n => n.publication_year <= year)
414403
console.log(filtered_data)
415404
$("#year").text("as at " + year);
416405
graph.graphData({
@@ -446,7 +435,7 @@ <h5 id="modal-title" class="modal-title" id="exampleModalLongTitle"></h5>
446435
console.log(d)
447436
if (window.year == d.x) return;
448437
window.year = d.x
449-
var filtered_data = window.current_data.filter(n => n.date_published.slice(0, 4) <= year)
438+
var filtered_data = window.current_data.filter(n => n.publication_year <= year)
450439
console.log(filtered_data)
451440
$("#year").text("as at " + year);
452441
graph.graphData({
@@ -455,17 +444,18 @@ <h5 id="modal-title" class="modal-title" id="exampleModalLongTitle"></h5>
455444
})
456445
});
457446

458-
var scale = colorscale(data.map(n => n.citation_count))
447+
var scale = colorscale(data.map(n => n.cited_by_count))
459448

460449
/**
 * Build the HTML snippet describing a single work (paper) node.
 *
 * @param {Object} node - work record; reads title, publication_date,
 *   authorships, primary_location, concepts and cited_by_count.
 * @param {boolean} [include_title=true] - prepend the bold "title (date)" line.
 * @returns {string} HTML fragment with one <br>-terminated line per section.
 */
function generateHTMLTooltip(node, include_title = true) {
    // Declared locally: the previous bare `result = ""` leaked an implicit global.
    // The per-render debug console.log(node) was also dropped.
    var result = ""
    if (include_title) {
        result += `<b>${node.title} (${node.publication_date})</b><br>`;
    }
    // "Author (First institution)" when an institution is listed, else just the author
    var authors = node.authorships.map(a => a.institutions.length ? a.author.display_name + " (" + a.institutions[0].display_name + ")" : a.author.display_name).join(",")
    // NOTE(review): primary_location is assumed to be nullable in the fetched
    // data - confirm against the API; guarding it keeps the tooltip rendering.
    var location = node.primary_location
    var venue = ""
    if (location) {
        venue = location.source ? location.source.display_name : (location.landing_page_url || "")
    }
    var concepts = node.concepts.map(c => c.display_name).join(",")
    result += `${authors}<br>${venue}<br>${concepts}<br>Citation count: ${node.cited_by_count}<br>`
    return result;
}
471461

@@ -476,8 +466,8 @@ <h5 id="modal-title" class="modal-title" id="exampleModalLongTitle"></h5>
476466
links: getEdges(data)
477467
})
478468
.enableNodeDrag(false)
479-
.nodeVal('citation_count')
480-
.nodeColor(n => scale(n.citation_count).css())
469+
.nodeVal('cited_by_count')
470+
.nodeColor(n => scale(n.cited_by_count).css())
481471
.nodeLabel(node => `<div class="nodetooltip">${generateHTMLTooltip(node)}</div>`)
482472
.onNodeHover(node => elem.style.cursor = node ? 'pointer' : null)
483473
.onNodeClick(node => {
@@ -491,17 +481,10 @@ <h5 id="modal-title" class="modal-title" id="exampleModalLongTitle"></h5>
491481
3000 // ms transition duration
492482
);
493483
$(".nodetooltip").remove()
494-
// The DOI is used directly as the link target (it is no longer prefixed with
// https://doi.org/ as it was for the old data source); it may be absent.
var doilink = node.doi;
// Title line: "<title> (<publication date>)" - the stray extra ")" is removed
$("#modal-title").text(`${node.title} (${node.publication_date})`);
// Only render the DOI link when the work actually has one, so a missing DOI
// does not show up as a clickable "null".
$("#modal-body").html(`${generateHTMLTooltip(node, false)}
${doilink ? `<a href="${doilink}">${doilink}</a><br>` : ""}`);
505488
$("#modal").modal('show')
506489
})
507490
$("#spinner").hide();
@@ -527,58 +510,56 @@ <h5 id="modal-title" class="modal-title" id="exampleModalLongTitle"></h5>
527510
window.group = this.value;
528511
if (this.value == "paper") {
529512
window.current_data = data;
530-
var filtered_data = data.filter(n => n.date_published.slice(0, 4) <= d)
513+
var filtered_data = data.filter(n => n.publication_year <= d)
531514
graph.graphData({
532515
nodes: filtered_data,
533516
links: getEdges(filtered_data)
534517
}).nodeLabel(node => `<div class="nodetooltip">${generateHTMLTooltip(node)}</div>`)
535518
return;
536519
}
537520
if (this.value == "author") {
538-
var objkey = "authors"
539-
var subkey = "AuId"
540-
var namekey = "DAuN"
521+
var objkey = "authorships"
522+
var subkey = "author.id"
523+
var namekey = "author.display_name"
541524
} else if (this.value == "inst") {
542-
var objkey = "authors"
543-
var subkey = "AfId"
544-
var namekey = "DAfN"
525+
var objkey = "authorships"
526+
var subkey = "institutions[0].id"
527+
var namekey = "institutions[0].display_name"
545528
} else if (this.value == "field") {
546-
var objkey = "fields"
547-
var subkey = "FId"
548-
var namekey = "DFN"
529+
var objkey = "concepts"
530+
var subkey = "id"
531+
var namekey = "display_name"
549532
} else if (this.value == "journal") {
550-
var objkey = "journals"
551-
var subkey = "JId"
552-
var namekey = "JN"
533+
var objkey = "locations"
534+
var subkey = "source.id"
535+
var namekey = "source.display_name"
553536
}
554537
var groupedData = {};
555538
for (var node of data) {
556-
if (!node[objkey]) continue;
557539
var seen_ids_this_node = {}
558540
// Fold this work into one aggregate node per distinct group id (author,
// institution, concept or source, depending on objkey/subkey).
for (var obj of node[objkey]) {
    var id = _.get(obj, subkey)
    // Skip entries without an id, and count each group at most once per work
    if (!id || seen_ids_this_node[id]) continue
    seen_ids_this_node[id] = true
    if (!groupedData[id]) {
        groupedData[id] = {
            id: id,
            cited_by_count: 0,
            name: _.get(obj, namekey),
            // Earliest publication seen for this group. Stored under the same
            // keys the paper nodes use so the year filters
            // (n.publication_year <= year) also work on grouped nodes.
            publication_date: node.publication_date,
            publication_year: node.publication_year,
            // Kept for any code outside this view that still reads the old key
            date_published: node.publication_date,
            referenced_works: []
        }
    }
    groupedData[id].cited_by_count += node.cited_by_count
    // Previously the object was created with `date_published` but compared on
    // `publication_date`, so this "earliest date" update could never fire.
    if (groupedData[id].publication_date > node.publication_date) {
        groupedData[id].publication_date = node.publication_date
        groupedData[id].publication_year = node.publication_year
        groupedData[id].date_published = node.publication_date
    }
    // Translate paper -> paper references into group -> group references
    for (var ref of node.referenced_works) {
        var paper = paper_lookup[ref];
        // Referenced work is not part of the loaded dataset
        if (!paper) continue
        if (!paper[objkey]) continue
        for (var other of paper[objkey]) {
            var otherId = _.get(other, subkey)
            if (!otherId) continue
            if (!groupedData[id].referenced_works.includes(otherId)) groupedData[id].referenced_works.push(otherId)
        }
    }
}
@@ -594,7 +575,7 @@ <h5 id="modal-title" class="modal-title" id="exampleModalLongTitle"></h5>
594575
graph.graphData({
595576
nodes: filteredData,
596577
links: getEdges(filteredData)
597-
}).nodeLabel(n => `<div class="nodetooltip">${n.name}<br>Citations: ${n.citation_count}</div>`)
578+
}).nodeLabel(n => `<div class="nodetooltip">${n.name}<br>Citations: ${n.cited_by_count}</div>`)
598579
.linkAutoColorBy(e => e.count)
599580
.linkWidth(e => e.count)
600581
})

network_stats.py

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/usr/bin/env python3
2+
3+
from glob import glob
4+
import json
5+
6+
def count_internal_edges(network):
    """Return the number of citation links that stay inside *network*.

    *network* is a list of work records, each with an ``id`` and a
    ``referenced_works`` list of ids. A reference counts as an edge only
    when the referenced id is itself a node in the same network.
    """
    known_ids = {n["id"] for n in network}
    return sum(r in known_ids for n in network for r in n["referenced_works"])


files = sorted(glob("data/*.json"))

# Print node and edge counts for every dataset under data/.
for f in files:
    # Context manager so each file handle is closed as soon as it is parsed
    # (the previous json.load(open(f)) left closing to the garbage collector).
    with open(f, encoding="utf-8") as fh:
        network = json.load(fh)
    edges = count_internal_edges(network)
    print(f"{f}: {len(network)} nodes, {edges} edges")

0 commit comments

Comments
 (0)