Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
"nuid": "3a75cbac-af5e-471b-8b17-9b979ab99dcb",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"\n",
"# Notebook Overview\n",
"\n",
"This notebook demonstrates how to download, verify, and manage DICOM test data from the OpenSlide project. It includes steps to fetch metadata, download files with integrity checks, and inspect the downloaded dataset. The workflow is designed for use in Databricks environments, following workspace policies for responsible resource usage and data management.\n",
"\n",
"## Requirements\n",
"- Serverless Notebook Compute\n",
"- Assumes the Unity Catalog (UC) volume `hls_radiology.openslide-cs-cmu-edu.test-data` (catalog.schema.volume) already exists; files are stored under its `DICOM` directory\n"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "1b12fe7a-1bf1-444d-b1c8-b6f803b3f3a3",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [],
"source": [
"%sh\n",
"# Fetch the OpenSlide DICOM test-data index into the UC volume.\n",
"# Abort on the first failing command so a failed `cd` can never cause\n",
"# index.yaml to be written into some other working directory.\n",
"set -e\n",
"\n",
"target_dir=/Volumes/hls_radiology/openslide-cs-cmu-edu/test-data/DICOM\n",
"mkdir -p \"$target_dir\"\n",
"cd \"$target_dir\"\n",
"pwd\n",
"ls\n",
"\n",
"# --fail: exit non-zero on an HTTP error instead of saving the error page as index.yaml.\n",
"# --location: follow redirects.\n",
"curl --fail --location -O https://openslide.cs.cmu.edu/download/openslide-testdata/DICOM/index.yaml"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "460eea7f-46be-452b-a66f-c8dad1bb75da",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [],
"source": [
"# %sh rm -rf /Volumes/hls_radiology/openslide-cs-cmu-edu/test-data/DICOM/*"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "a4de3025-dd30-4e84-9e51-56b174088f40",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [],
"source": [
"%sh ls /Volumes/hls_radiology/openslide-cs-cmu-edu/test-data/DICOM/"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "db48f882-9692-45aa-8458-41151f05ba21",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [],
"source": [
"import os\n",
"import requests\n",
"import tempfile\n",
"import yaml\n",
"import hashlib\n",
"\n",
"path = \"/Volumes/hls_radiology/openslide-cs-cmu-edu/test-data/DICOM/index.yaml\"\n",
"base_url = \"https://openslide.cs.cmu.edu/download/openslide-testdata/DICOM/\"\n",
"base_path = os.path.dirname(path)\n",
"\n",
"\n",
"def download_slide(file_name, expected_hash, chunk_size=65536, timeout=60):\n",
"  \"\"\"Download one slide file and keep it only if its SHA-256 digest matches.\n",
"\n",
"  The response is streamed into a temporary file inside `base_path` while the\n",
"  digest is computed over the same chunks. The temporary file is renamed to its\n",
"  final name only after the digest check passes, and it is removed on every\n",
"  other outcome (hash mismatch or any exception).\n",
"\n",
"  Args:\n",
"    file_name: Key from the index; appended to `base_url` and joined to `base_path`.\n",
"    expected_hash: Hex SHA-256 digest listed for the file in the index.\n",
"    chunk_size: Number of bytes requested per streamed chunk.\n",
"    timeout: Seconds passed to `requests.get` as its timeout.\n",
"\n",
"  Returns:\n",
"    True if the file was downloaded and verified, False on a hash mismatch.\n",
"\n",
"  Raises:\n",
"    requests.exceptions.RequestException: on HTTP or network errors.\n",
"  \"\"\"\n",
"  url = base_url + file_name\n",
"  file_path = os.path.join(base_path, file_name)\n",
"  sha256_hash = hashlib.sha256()\n",
"  tmp_name = None\n",
"  try:\n",
"    with requests.get(url, stream=True, timeout=timeout) as r:\n",
"      r.raise_for_status()\n",
"      with tempfile.NamedTemporaryFile(delete=False, dir=base_path) as tmp_f:\n",
"        tmp_name = tmp_f.name\n",
"        for chunk in r.iter_content(chunk_size=chunk_size):\n",
"          if chunk:\n",
"            tmp_f.write(chunk)\n",
"            # Hash the bytes as they are written so the digest never depends\n",
"            # on re-reading a file that may not be flushed yet.\n",
"            sha256_hash.update(chunk)\n",
"            print(\".\", end=\"\", flush=True)\n",
"    print()  # finish the progress-dot line\n",
"    calculated_hash = sha256_hash.hexdigest()\n",
"\n",
"    if calculated_hash != expected_hash:\n",
"      print(f\"SHA256 hash mismatch for {file_name}: expected {expected_hash}, got {calculated_hash}\")\n",
"      return False\n",
"\n",
"    print(f\"SHA256 hash matches for {file_name}\")\n",
"    os.rename(tmp_name, file_path)\n",
"    tmp_name = None  # ownership moved to file_path; nothing left to clean up\n",
"    print(f\"File '{file_name}' downloaded successfully.\")\n",
"    return True\n",
"  finally:\n",
"    # Never leave a partial or rejected temp file behind in the volume.\n",
"    if tmp_name is not None and os.path.exists(tmp_name):\n",
"      os.remove(tmp_name)\n",
"\n",
"\n",
"with open(path) as index_file:\n",
"  # NOTE(review): index.yaml is downloaded from the network; yaml.safe_load is the\n",
"  # safer loader for external data. Confirm the index uses only plain YAML before switching.\n",
"  index = yaml.load(index_file, Loader=yaml.FullLoader)\n",
"\n",
"# Tolerate an empty index or a missing/empty \"slides\" mapping.\n",
"slides = (index or {}).get(\"slides\") or {}\n",
"\n",
"for file_name, slide_info in slides.items():\n",
"  file_path = os.path.join(base_path, file_name)\n",
"  if os.path.exists(file_path):\n",
"    continue  # already downloaded\n",
"\n",
"  expected_hash = (slide_info or {}).get(\"sha256\")\n",
"  if not expected_hash:\n",
"    print(f\"No sha256 listed for {file_name}; skipping because it cannot be verified\")\n",
"    continue\n",
"\n",
"  print(f\"Downloading {file_name} to {file_path}\")\n",
"  try:\n",
"    download_slide(file_name, expected_hash)\n",
"  except requests.exceptions.RequestException as e:\n",
"    print(f\"Error downloading file: {e}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "6fa66e33-03f3-499a-9782-8f4b4ba0ccf3",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [],
"source": [
"%sh ls -alH /Volumes/hls_radiology/openslide-cs-cmu-edu/test-data/DICOM"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "e60d5d3a-d3fd-42aa-a8de-c291482acafb",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [],
"source": []
}
],
"metadata": {
"application/vnd.databricks.v1+notebook": {
"computePreferences": null,
"dashboards": [],
"environmentMetadata": {
"base_environment": "dbe_0c235d96-4bc7-4fb5-b118-17fd1dad0124",
"environment_version": "4"
},
"inputWidgetPreferences": null,
"language": "python",
"notebookMetadata": {
"mostRecentlyExecutedCommandWithImplicitDF": {
"commandId": 8149239478262711,
"dataframes": [
"_sqldf"
]
},
"pythonIndentUnit": 2
},
"notebookName": "Openslide Sample Data downloader Notebook",
"widgets": {}
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}