Skip to content

Commit f1fd3de

Browse files
authored
Merge branch 'master' into report
2 parents e48793c + 9d5733d commit f1fd3de

File tree

8 files changed

+35494
-1354
lines changed

8 files changed

+35494
-1354
lines changed

Notebook_1_SQL_database.ipynb

Lines changed: 1137 additions & 1185 deletions
Large diffs are not rendered by default.

Notebook_2_API_queries.ipynb

Lines changed: 210 additions & 147 deletions
Large diffs are not rendered by default.

Notebook_3_metadata_overview.ipynb

Lines changed: 44 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
},
1010
{
1111
"cell_type": "code",
12-
"execution_count": 96,
12+
"execution_count": 22,
1313
"metadata": {},
1414
"outputs": [
1515
{
@@ -47,21 +47,19 @@
4747
},
4848
{
4949
"cell_type": "code",
50-
"execution_count": 97,
50+
"execution_count": 23,
5151
"metadata": {},
5252
"outputs": [],
5353
"source": [
5454
"# load metadata\n",
5555
"\n",
5656
"df_meta = pd.read_csv(\"datasets_output/df_pub.csv\",compression=\"gzip\")\n",
57-
"df_cord = pd.read_csv(\"datasets_output/sql_tables/cord19_metadata.csv\",sep=\"\\t\",header=None,names=[ 'cord19_metadata_id', 'source', 'license', 'full_text_file', 'ms_academic_id',\n",
58-
" 'who_covidence', 'sha', 'full_text', 'pub_id'])\n",
5957
"df_meta.drop(columns=\"Unnamed: 0\",inplace=True)"
6058
]
6159
},
6260
{
6361
"cell_type": "code",
64-
"execution_count": 98,
62+
"execution_count": 24,
6563
"metadata": {},
6664
"outputs": [
6765
{
@@ -222,7 +220,7 @@
222220
"4 2020-03-28 08:46:55.291546 "
223221
]
224222
},
225-
"execution_count": 98,
223+
"execution_count": 24,
226224
"metadata": {},
227225
"output_type": "execute_result"
228226
}
@@ -233,7 +231,7 @@
233231
},
234232
{
235233
"cell_type": "code",
236-
"execution_count": 99,
234+
"execution_count": 25,
237235
"metadata": {},
238236
"outputs": [
239237
{
@@ -245,7 +243,7 @@
245243
" dtype='object')"
246244
]
247245
},
248-
"execution_count": 99,
246+
"execution_count": 25,
249247
"metadata": {},
250248
"output_type": "execute_result"
251249
}
@@ -263,7 +261,7 @@
263261
},
264262
{
265263
"cell_type": "code",
266-
"execution_count": 100,
264+
"execution_count": 6,
267265
"metadata": {},
268266
"outputs": [],
269267
"source": [
@@ -283,7 +281,7 @@
283281
},
284282
{
285283
"cell_type": "code",
286-
"execution_count": 101,
284+
"execution_count": 7,
287285
"metadata": {},
288286
"outputs": [
289287
{
@@ -300,7 +298,7 @@
300298
"Name: publication_year, dtype: float64"
301299
]
302300
},
303-
"execution_count": 101,
301+
"execution_count": 7,
304302
"metadata": {},
305303
"output_type": "execute_result"
306304
}
@@ -311,16 +309,16 @@
311309
},
312310
{
313311
"cell_type": "code",
314-
"execution_count": 102,
312+
"execution_count": 8,
315313
"metadata": {},
316314
"outputs": [
317315
{
318316
"data": {
319317
"text/plain": [
320-
"<matplotlib.axes._subplots.AxesSubplot at 0x1a6fc3d6d0>"
318+
"<matplotlib.axes._subplots.AxesSubplot at 0x1a232c0d50>"
321319
]
322320
},
323-
"execution_count": 102,
321+
"execution_count": 8,
324322
"metadata": {},
325323
"output_type": "execute_result"
326324
},
@@ -341,16 +339,16 @@
341339
},
342340
{
343341
"cell_type": "code",
344-
"execution_count": 103,
342+
"execution_count": 9,
345343
"metadata": {},
346344
"outputs": [
347345
{
348346
"data": {
349347
"text/plain": [
350-
"<matplotlib.axes._subplots.AxesSubplot at 0x1a6fdae150>"
348+
"<matplotlib.axes._subplots.AxesSubplot at 0x1a1ca2c990>"
351349
]
352350
},
353-
"execution_count": 103,
351+
"execution_count": 9,
354352
"metadata": {},
355353
"output_type": "execute_result"
356354
},
@@ -369,6 +367,35 @@
369367
"sns.distplot(df_meta[(pd.notnull(df_meta.publication_year)) & (df_meta.publication_year > 2000)].publication_year.tolist(), bins=20, hist=True, kde=False)"
370368
]
371369
},
370+
{
371+
"cell_type": "code",
372+
"execution_count": 14,
373+
"metadata": {},
374+
"outputs": [],
375+
"source": [
376+
"df_meta[\"abstract_length\"] = df_meta.abstract.str.len()"
377+
]
378+
},
379+
{
380+
"cell_type": "code",
381+
"execution_count": 18,
382+
"metadata": {},
383+
"outputs": [
384+
{
385+
"data": {
386+
"text/plain": [
387+
"(39154, 14)"
388+
]
389+
},
390+
"execution_count": 18,
391+
"metadata": {},
392+
"output_type": "execute_result"
393+
}
394+
],
395+
"source": [
396+
"df_meta[df_meta.abstract_length>0].shape"
397+
]
398+
},
372399
{
373400
"cell_type": "markdown",
374401
"metadata": {},

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@ This workflow can be illustrated as follows:
1616

1717
For the moment, we consider publications from the following sources:
1818

19-
* [CORD19](https://pages.semanticscholar.org/coronavirus-research) (last updated March 28, 2020):
20-
* [Dimensions](https://docs.google.com/spreadsheets/d/1-kTZJZ1GAhJ2m4GAIhw1ZdlgO46JpvX0ZQa232VWRmw/edit#gid=2034285255) (last updated March 28, 2020):
21-
* [WHO](https://www.who.int/emergencies/diseases/novel-coronavirus-2019/global-research-on-novel-coronavirus-2019-ncov) (last updated March 28, 2020)
19+
* [CORD19](https://pages.semanticscholar.org/coronavirus-research) (last updated April 10, 2020)
20+
* [Dimensions](https://docs.google.com/spreadsheets/d/1-kTZJZ1GAhJ2m4GAIhw1ZdlgO46JpvX0ZQa232VWRmw/edit#gid=2034285255) (last updated April 10, 2020)
21+
* [WHO](https://www.who.int/emergencies/diseases/novel-coronavirus-2019/global-research-on-novel-coronavirus-2019-ncov) (last updated April 10, 2020)
2222

2323
You will need to download these datasets and add them to a local folder in order to process them. We assume that you will have a local copy of the whole CORD19 dataset, and a `csv` file with publication metadata for Dimensions and WHO. Previous releases of the Dimensions and WHO lists can be found in the [datasets_input](datasets_input) folder. Please also see the notebooks below for more details.
2424

@@ -39,14 +39,14 @@ You can use the [Notebook_1_SQL_database](Notebook_1_SQL_database.ipynb) noteboo
3939
* The `pub` table contains publications from all data sources. If you would like to work with publications coming exclusively from one data source, join it with the `datasource` table via the `pub_datasource` table.
4040
* The primary keys of all tables (`pub_id`, `cord19_metadata_id`, `who_metadata_id`, `dimensions_metadata_id`, `datasource_id`) are not stable and are only internally consistent: if you create different versions of the database, they will likely differ.
4141
* In order to work with Dimensions and Altmetrics data, *publication identifiers* should be used. Please give preference to DOIs, then to PMIDs, then to PMCIDs (listed in order of coverage).
42-
* We removed a few (<1000) publications which had no known identifier among these three options. These are usually pre-prints, which are likely to be equipped with an identifier in future releases.
42+
* We removed a few (~1200) publications which had no known identifier among these three options. These are usually pre-prints, which are likely to be equipped with an identifier in future releases.
4343
* The `metadata` tables contain fields which are specific to a datasource and which we considered potentially useful. They are only available for publications coming from that datasource.
4444

4545
### Query Dimensions and Altmetrics
4646

4747
You can then query [Dimensions](https://docs.dimensions.ai/dsl) and [Altmetrics](https://api.altmetric.com) APIs using your own keys, using the [Notebook_2_API_queries](Notebook_2_API_queries.ipynb) notebook. You can request access as a researcher here: https://www.dimensions.ai/scientometric-research.
4848

49-
### Data overview
49+
### Data analysis
5050

5151
Using the [Notebook_3_metadata_overview](Notebook_3_metadata_overview.ipynb) and [Notebook_4_API_data_overview](Notebook_4_API_data_overview.ipynb) notebooks, you can get an overview of some of the resulting metadata and data.
5252

0 commit comments

Comments
 (0)