Source code for src.importer.parser
"""Parser for Apple Health export files.
Reads the export.zip produced by the Apple Health app and returns a single
tidy DataFrame that can be saved to disk by the caller.
"""
import logging
import zipfile
from pathlib import Path
import defusedxml.ElementTree as ETree
import pandas as pd
logger = logging.getLogger(__name__)
# columns of parse_apple_health output df
_COLUMNS = (
"type",
"sourceName",
"sourceVersion",
"device",
"unit",
"startDate",
"endDate",
"creationDate",
"value",
)
[docs]
class NoHealthDataError(ValueError):
    """Signal that a parsed zip file contained no usable health data.

    Derives from ``ValueError``, so handlers written for ``ValueError``
    catch it as well.
    """
[docs]
def parse_apple_health(zip_path: str | Path) -> pd.DataFrame:
    """Parse an Apple Health export archive into a DataFrame.

    Reads all Record elements from the export.xml inside the given export.zip
    and returns them as a single tidy DataFrame with timezone-aware datetime
    columns. The output schema is a superset of the one produced by
    apple-health-exporter's health_xml_to_feather, adding device, sourceVersion,
    and creationDate columns.

    Args:
        zip_path: Path to the export.zip file generated by the Apple Health app.

    Returns:
        A DataFrame with one row per health record and the following columns:

        - type (str): The HealthKit record type, e.g.
          ``HKQuantityTypeIdentifierHeartRate``.
        - sourceName (str): Human-readable name of the recording source, e.g.
          ``"Apple Watch"``.
        - sourceVersion (str): Version string of the recording source app.
        - device (str): Full device identifier string as reported by HealthKit.
        - unit (str): Unit of measurement, e.g. ``"count/min"`` or ``"kg"``.
        - startDate (datetime64[ns, UTC]): Start of the recorded interval.
        - endDate (datetime64[ns, UTC]): End of the recorded interval.
        - creationDate (datetime64[ns, UTC]): When the record was created.
        - value (str): The recorded value as a string, e.g. ``"72"`` or
          ``"5.6"``. Cast to a numeric type by the caller if needed.

        An attribute that is absent from a record yields ``None`` (``NaT`` in
        the date columns) in that row.

    Raises:
        NoHealthDataError: If export.xml contains no ``Record`` elements.
        KeyError: If the archive has no ``apple_health_export/export.xml``
            member.
        ValueError: If a date attribute does not match the expected
            ``YYYY-MM-DD HH:MM:SS ±HHMM`` layout.

    Example:
        >>> df = parse_apple_health("export.zip")
        >>> df.to_feather("data.feather")
        >>> heart_rate = df[df["type"] == "HKQuantityTypeIdentifierHeartRate"]
        >>> heart_rate["value"].astype(float).mean()
    """
    date_columns = ("startDate", "endDate", "creationDate")
    # Layout of date attributes in export.xml, e.g. "2023-01-15 08:30:00 -0500".
    # NOTE(review): taken from observed exports; confirm against your data.
    date_format = "%Y-%m-%d %H:%M:%S %z"

    rows = []
    with (
        zipfile.ZipFile(zip_path) as zf,
        zf.open("apple_health_export/export.xml") as f,
    ):
        # Stream the document instead of loading it whole. Only "end" events
        # are requested, so an element's attributes are complete when seen.
        for _event, elem in ETree.iterparse(f, events=("end",)):
            if elem.tag == "Record":
                rows.append({col: elem.attrib.get(col) for col in _COLUMNS})
            # Drop the element's content once handled to limit memory use.
            elem.clear()

    if not rows:
        logger.error("No records found in zip file %s.", zip_path)
        raise NoHealthDataError(f"No records found in zip file {zip_path}.")

    df = pd.DataFrame(rows, columns=list(_COLUMNS))
    # Offsets can differ between records, so normalise everything to UTC to
    # get one timezone-aware dtype per column.
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], format=date_format, utc=True)
    return df