-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.qmd
167 lines (156 loc) · 5.97 KB
/
index.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
---
title: "Vega datasets"
toc: true
---
```{python}
from vega_datasets import data
import pandas as pd
from itables import init_notebook_mode
# Switch itables on for this notebook session.
# NOTE(review): all_interactive=True presumably makes every DataFrame output
# render as an interactive table — confirm against the itables docs.
init_notebook_mode(all_interactive=True)
```
```{python}
# Print the name of every dataset reported by vega_datasets, one per line.
available = data.list_datasets()
for dataset in available:
    print(dataset)
```
```{python}
import pandas as pd
from vega_datasets import data
from itables import show
import textwrap
# Human-readable blurbs keyed by dataset name. get_dataset_info looks names up
# here and falls back to "No description available." for any dataset that has
# no entry (e.g. "budgets", "climate", "iris" are listed in dataset_names but
# not described here).
dataset_descriptions = {
"7zip": "Application icon from the open-source 7-Zip software project.",
"airports": "Airport data, source not specified in the original description.",
"annual-precip": "A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell, from CFSv2.",
"anscombe": "Dataset from 'Graphs in Statistical Analysis' by F. J. Anscombe, The American Statistician.",
"barley": "Results from a 1930s agricultural experiment in Minnesota, containing yields for 10 different varieties of barley at six different sites.",
"birdstrikes": "Bird strike data from http://wildlife.faa.gov",
"budget": "U.S. Budget data for FY 2016 from the Office of Management and Budget.",
"burtin": "Dataset based on Will Burtin's 1951 visualization of antibiotic effectiveness, comparing three antibiotics against 16 bacteria.",
"cars": "Car statistics dataset from http://lib.stat.cmu.edu/datasets/",
"countries": "Demographic indicators (life expectancy and fertility rate) for various countries from 1955 to 2000 at 5-year intervals.",
"crimea": "Data related to Florence Nightingale's famous 'Coxcomb' chart about mortality in the Crimean War.",
"disasters": "Natural disaster data from https://ourworldindata.org/natural-catastrophes",
"driving": "Driving statistics, possibly related to a New York Times article from May 2, 2010.",
"earthquakes": "Earthquake data from USGS, captured on Feb 6, 2018.",
"ffox": "Application icon from the open-source Firefox software project.",
"flare": "Hierarchical data, possibly related to software architecture or dependencies.",
"flights-10k": "Flight delay statistics from U.S. Bureau of Transportation Statistics, subset of 10,000 records.",
"gapminder": "Demographic indicators (life expectancy, population, and fertility rate) for various countries from 1955 to 2005 at 5-year intervals.",
"gimp": "Application icon from the open-source GIMP software project.",
"github": "GitHub-related data, generated using a custom script.",
"iowa-electricity": "Annual net generation of electricity in Iowa by source, compiled by the U.S. Energy Information Administration.",
"jobs": "U.S. census data on occupations by sex and year across decades between 1850 and 2000.",
"miserables": "Co-occurrence of characters in Victor Hugo's Les Misérables.",
"monarchs": "Chronological list of English and British monarchs from Elizabeth I through George IV.",
"movies": "Movie dataset with intentionally included errors for instructional purposes.",
"population": "United States population statistics by sex and age group across decades between 1850 and 2000.",
"seattle-weather": "Daily weather records for Seattle with metric units, synthesized from NOAA data.",
"sp500": "S&P 500 index values from 2000 to 2020, retrieved from Yahoo Finance.",
"unemployment": "County-level unemployment rates in the United States, generally consistent with 2009 levels.",
"us-10m": "U.S. map data at 1:10 million scale.",
"volcano": "Topographic information for Maunga Whau (Mt Eden) volcano in Auckland, New Zealand.",
"weather": "Instructional dataset showing actual and predicted temperature data.",
"wheat": "250 years of wheat prices alongside weekly wages and reigning monarchs, based on William Playfair's 1822 chart.",
"zipcodes": "Zipcode data from GeoNames.org",
}
def get_dataset_info(dataset_name):
    """Load one dataset and summarise it for display.

    Args:
        dataset_name: Name handed to ``data(...)``, e.g. ``"cars"``.

    Returns:
        A dict with the keys ``name``, ``df``, ``rows``, ``columns``,
        ``size_kb`` and ``description``, or ``None`` when the dataset cannot
        be loaded as a DataFrame. ``df`` holds at most the first 100 rows (a
        preview for rendering); ``rows`` and ``size_kb`` describe the full
        dataset, not the preview.
    """
    try:
        full_df = data(dataset_name)
    except Exception:
        # Deliberate best-effort: a dataset that fails to load for any reason
        # is reported as None so the caller can skip it instead of aborting
        # the whole page.
        return None

    # The summary below needs DataFrame attributes (.columns, .memory_usage).
    # NOTE(review): some loaders presumably return non-tabular objects; those
    # used to be dropped via a swallowed AttributeError, now explicitly.
    if not isinstance(full_df, pd.DataFrame):
        return None

    return {
        "name": dataset_name,
        # Only the preview is kept for rendering; head(100) is a no-op for
        # frames that already have 100 rows or fewer.
        "df": full_df.head(100),
        # Measured on the full frame so large datasets are not all reported
        # as having exactly 100 rows.
        "rows": len(full_df),
        "columns": len(full_df.columns),
        "size_kb": full_df.memory_usage(deep=True).sum() / 1024,
        "description": dataset_descriptions.get(
            dataset_name, "No description available."
        ),
    }
# Names of the datasets to load and render, in the order they are displayed.
# Entries whose load fails are dropped later, when get_dataset_info returns
# None for them.
# NOTE(review): this hand-written list may drift from data.list_datasets();
# consider deriving it from there — confirm the two agree first.
dataset_names = [
"7zip",
"airports",
"annual-precip",
"anscombe",
"barley",
"birdstrikes",
"budget",
"budgets",
"burtin",
"cars",
"climate",
"co2-concentration",
"countries",
"crimea",
"disasters",
"driving",
"earthquakes",
"ffox",
"flare",
"flare-dependencies",
"flights-10k",
"flights-20k",
"flights-2k",
"flights-5k",
"flights-airport",
"gapminder",
"gapminder-health-income",
"gimp",
"github",
"graticule",
"income",
"iowa-electricity",
"iris",
"jobs",
"la-riots",
"londonBoroughs",
"londonCentroids",
"londonTubeLines",
"lookup_groups",
"lookup_people",
"miserables",
"monarchs",
"movies",
"normal-2d",
"obesity",
"ohlc",
"points",
"population",
"population_engineers_hurricanes",
"seattle-temps",
"seattle-weather",
"sf-temps",
"sp500",
"stocks",
"udistrict",
"unemployment",
"unemployment-across-industries",
"uniform-2d",
"us-10m",
"us-employment",
"us-state-capitals",
"volcano",
"weather",
"weball26",
"wheat",
"windvectors",
"world-110m",
"zipcodes",
]
# Build the info record for every listed dataset, keeping only those that
# loaded (not None) and report a non-zero memory footprint.
loaded = (get_dataset_info(name) for name in dataset_names)
dataset_info = [
    info for info in loaded if info is not None and info["size_kb"] > 0
]

# Emit one section per dataset: heading, summary line, wrapped description,
# then the table preview.
for info in dataset_info:
    heading = f"\n\n## {info['name']}"
    stats = f"Rows: {info['rows']}, Columns: {info['columns']}, Size: {info['size_kb']:.2f} KB"
    wrapped = textwrap.fill(info["description"], width=80)
    # One print with sep="\n" writes the same text as four separate prints.
    print(heading, stats, "\n**Description:**", wrapped, sep="\n")
    show(info["df"], caption=f"Top 100 rows of {info['name']} dataset")
    print("\n")
```