In [1]:
import json
import numpy as np
from dandi.dandiapi import DandiAPIClient
from tqdm.notebook import tqdm
from isodate import parse_duration, Duration
from datetime import datetime

In [2]:
# Connect to the DANDI archive and collect every dandiset it reports into a list.
client = DandiAPIClient()
dandisets = [*client.get_dandisets()]

# More specific identification of NWB dandisets

The simpler tutorial only tested if the phrase "NWB" was in the name of any of the data standards for a dandiset.

The more official and precise method is to use the specific [RRID of NWB](https://scicrunch.org/resolver/RRID:SCR_015242), which is `"RRID:SCR_015242"`.

In [3]:
NWB_RRID = "RRID:SCR_015242"  # this is the RRID for NWB

nwb_dandisets = []

for dandiset in tqdm(dandisets):
    raw_metadata = dandiset.get_raw_metadata()

    # Every level is looked up defensively (missing or null -> empty) so that a single
    # dandiset with sparse metadata cannot abort the scan with a KeyError.
    assets_summary = raw_metadata.get("assetsSummary") or {}
    data_standards = assets_summary.get("dataStandard") or []

    # A dandiset counts as NWB if any of its declared data standards carries the NWB RRID
    if any(data_standard.get("identifier") == NWB_RRID for data_standard in data_standards):
        nwb_dandisets.append(dandiset)
print(f"There are currently {len(nwb_dandisets)} NWB datasets on DANDI!")

  0%|          | 0/465 [00:00<?, ?it/s]

There are currently 277 NWB datasets on DANDI!


# Average age of subjects used in a dandiset

Let's consider a more advanced calculation - finding the average age of all the subjects used in a particular dandiset.

For this we will be directly accessing the asset-level field `wasAttributedTo` as a key of the asset's raw metadata dictionary (the result of `asset.get_raw_metadata()`), instead of as an attribute.

We will also have to do some manual data manipulation to parse the ages, which are stored as [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) durations.

In [4]:
def iso_to_fractional_days(age_iso: str, experiment_date: str) -> "float | None":
    """
    Parse an ISO 8601 age and return it as a float-valued number of days.

    A `datetime.timedelta` can only report either its `.days` (integer, rounded down) or its
    `total_seconds()`, so the fractional number of days is derived from the total seconds.

    This helper also resolves some complications that can arise in other datasets when the age is
    measured in years, or if the age is a range.

    Parameters
    ----------
    age_iso : str
        The age as an ISO 8601 duration string.
    experiment_date : str
        ISO 8601 date-time of the experiment. It is only used when the age parses to an
        `isodate.Duration`, where it serves as the end point the duration is anchored to.

    Returns
    -------
    float or None
        The age in days, or None when `age_iso` is a range (contains "/").
    """
    if "/" in age_iso:  # Some ages have lower and upper bounds due to uncertainty
        return None  # Skip

    age_duration = parse_duration(datestring=age_iso)

    if isinstance(age_duration, Duration):
        # `datetime.fromisoformat` only accepts a trailing "Z" (UTC) from Python 3.11 onwards,
        # so spell the offset out explicitly to stay compatible with older interpreters.
        if experiment_date.endswith("Z"):
            experiment_date = experiment_date[:-1] + "+00:00"
        experiment_datetime = datetime.fromisoformat(experiment_date)
        time_delta = age_duration.totimedelta(end=experiment_datetime)
    else:
        time_delta = age_duration

    # 60 seconds per minute * 60 minutes per hour * 24 hours per day (ignoring daylight savings time)
    seconds_per_day = 60 * 60 * 24
    return time_delta.total_seconds() / seconds_per_day


all_subject_ages_in_days = []

dandiset = client.get_dandiset("000398")
assets = list(dandiset.get_assets())
for asset in tqdm(assets):
    raw_metadata = asset.get_raw_metadata()
    # Tolerate assets whose metadata has no (or a null) `wasAttributedTo` entry
    subjects = raw_metadata.get("wasAttributedTo") or []

    for subject_metadata in subjects:
        if "age" in subject_metadata:
            age_in_days = iso_to_fractional_days(
                age_iso=subject_metadata["age"]["value"],
                experiment_date=raw_metadata["wasGeneratedBy"][0]["startDate"]
            )

            # Compare against None explicitly: a plain truthiness test would also
            # discard a legitimate age of exactly 0 days.
            if age_in_days is not None:
                all_subject_ages_in_days.append(age_in_days)
print(f"The average age of the subjects in dandiset #398 is: {np.mean(all_subject_ages_in_days)} days")

  0%|          | 0/42 [00:00<?, ?it/s]

The average age of the subjects in dandiset #398 is: 170.74276620370375 days


# Count the number of spiking units across all sessions in an experiment

The number of units identified from spike sorting is not something that DANDI extracts automatically during upload...

But we can calculate it ourselves without downloading an entire dandiset!

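We do this by streaming directly from the archive, which requires us to retrieve the asset URL on the S3 backend of the DANDI archive and then open it remotely. One option is to set the `driver` argument to `ros3` (Read-Only S3); the code below instead wraps the URL in a `remfile.File` and hands that to `h5py`.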

There are several ways to retrieve the S3 path, but the easiest is to use the NWB Inspector helper function `nwbinspector.tools.get_s3_urls_and_dandi_paths`, which will format the path in the way `ros3` expects.

In [9]:
!pip install remfile

Collecting remfile
  Downloading remfile-0.1.13-py3-none-any.whl.metadata (3.7 kB)
Downloading remfile-0.1.13-py3-none-any.whl (11 kB)
Installing collected packages: remfile
Successfully installed remfile-0.1.13


In [12]:
from warnings import simplefilter
# NOTE: "ignore" with no category silences every warning from here on, not only the NWB namespace ones
simplefilter("ignore")  # Suppress namespace warnings from reading older NWB files

from nwbinspector.tools import get_s3_urls_and_dandi_paths  # used below to look up a dandiset's S3 URLs
from pynwb import NWBHDF5IO  # reads an NWB file from an open h5py file
import remfile  # wraps a remote URL as a file-like object so assets can be streamed
import h5py  # opens the remfile object as an HDF5 file

In [14]:
s3_urls = get_s3_urls_and_dandi_paths(dandiset_id="000059")

num_units_per_asset = dict()
for s3_url in tqdm(s3_urls):
    rem_file = remfile.File(s3_url)

    # Open each remote file inside context managers so its HDF5 handle is released
    # before moving on, rather than leaving one open handle behind per asset.
    with h5py.File(rem_file, "r") as h5py_file, NWBHDF5IO(file=h5py_file, load_namespaces=True) as io:
        nwbfile = io.read()

        # The count has to be taken while the file is still open.
        # Assets without a (non-empty) units table are left out of the mapping.
        if nwbfile.units:
            num_units_per_asset[s3_url] = len(nwbfile.units)

  0%|          | 0/100 [00:00<?, ?it/s]

In [15]:
# Display the unit count recorded per asset URL (only assets whose `units` was truthy were recorded)
num_units_per_asset

{'https://dandiarchive.s3.amazonaws.com/blobs/093/2c2/0932c245-ac35-4dfd-be76-20ae328f43a4': 395,
 'https://dandiarchive.s3.amazonaws.com/blobs/dac/f24/dacf2486-4b1c-4b7f-b3a6-43453670e436': 381,
 'https://dandiarchive.s3.amazonaws.com/blobs/fd9/600/fd9600d5-3623-48fb-b84b-796b7e00b9cf': 365,
 'https://dandiarchive.s3.amazonaws.com/blobs/f4f/37e/f4f37e2f-5b44-4ab9-a8a0-9fbdc218b357': 344,
 'https://dandiarchive.s3.amazonaws.com/blobs/235/5bd/2355bdc4-394a-4a9e-bbd7-fbfcc4d9ec7b': 361,
 'https://dandiarchive.s3.amazonaws.com/blobs/a19/13b/a1913b51-ab11-42af-a5ca-7b346d59ef37': 915,
 'https://dandiarchive.s3.amazonaws.com/blobs/927/c68/927c68bb-82c2-4cdb-bde8-af836d0d1664': 1114,
 'https://dandiarchive.s3.amazonaws.com/blobs/8ea/05c/8ea05cea-0159-4b78-9058-30de1348e7c7': 468,
 'https://dandiarchive.s3.amazonaws.com/blobs/824/ef1/824ef138-56b9-4abc-bb3f-233d623c9f3e': 928,
 'https://dandiarchive.s3.amazonaws.com/blobs/cd1/1ae/cd11ae52-a90d-48cc-bc16-c90113fd094b': 1024,
 'https://dandiarc

In [16]:
# Add up the per-asset counts to get the total for the whole dandiset
total_num_units = sum(num_units_per_asset.values())
print(f"Dandiset #59 has a total of {total_num_units} identified spiking units!")

Dandiset #59 has a total of 22319 identified spiking units!


# Going beyond
These examples show a few types of queries, but since the metadata structures are quite rich on both the dandiset and asset levels, they enable many complex queries beyond the examples here.

These metadata structures are also expanding over time as DANDI becomes more strict about what counts as essential metadata.

The `.get_raw_metadata` method of both `client.get_dandiset(...)` and `client.get_dandiset(...).get_assets()` provides a nice view into the available fields.

Note: for any attribute, it is recommended to first check that it is not `None` before checking for its value.

In [17]:
# Pretty-print the raw metadata of the first asset as indented JSON to show the available fields
first_asset_metadata = assets[0].get_raw_metadata()
print(json.dumps(first_asset_metadata, indent=4))

{
    "id": "dandiasset:11c25674-6eff-43a8-8dba-7dea2e8c76c4",
    "path": "sub-San4/sub-San4_ses-20200302T142114_ecephys.nwb",
    "access": [
        {
            "status": "dandi:OpenAccess",
            "schemaKey": "AccessRequirements"
        }
    ],
    "digest": {
        "dandi:dandi-etag": "4c907ae8685aea1bfbe57316942b881f-4",
        "dandi:sha2-256": "b770e3ac3f75f40618de2ba2a81e996429d5fc01dd530e9d826acc7a1ad0853c"
    },
    "@context": "https://raw.githubusercontent.com/dandi/schema/master/releases/0.6.3/context.json",
    "approach": [
        {
            "name": "electrophysiological approach",
            "schemaKey": "ApproachType"
        }
    ],
    "schemaKey": "Asset",
    "contentUrl": [
        "https://api.dandiarchive.org/api/assets/11c25674-6eff-43a8-8dba-7dea2e8c76c4/download/",
        "https://dandiarchive.s3.amazonaws.com/blobs/429/baa/429baaad-a057-411d-8957-8460947aef73"
    ],
    "identifier": "11c25674-6eff-43a8-8dba-7dea2e8c76c4",
    "content