Source code for src.importer.parser

"""Parser for Apple Health export files.

Reads the export.zip produced by the Apple Health app and returns a single
tidy DataFrame that the caller can then save to disk.
"""

import logging
import zipfile
from pathlib import Path

import defusedxml.ElementTree as ETree
import pandas as pd

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Attributes read from every <Record> element of export.xml. Each name becomes
# a column of the DataFrame returned by parse_apple_health, in this order; an
# attribute that a record does not carry is looked up with ``.get`` and
# therefore yields None.
_COLUMNS = (
    "type",
    "sourceName",
    "sourceVersion",
    "device",
    "unit",
    "startDate",
    "endDate",
    "creationDate",
    "value",
)


class NoHealthDataError(ValueError):
    """Raised when trying to parse a zip file without usable data.

    Subclasses ``ValueError`` so callers that already handle ``ValueError``
    for bad input also catch this case.
    """
def parse_apple_health(zip_path: str | Path) -> pd.DataFrame:
    """Parse an Apple Health export archive into a DataFrame.

    Reads all ``Record`` elements from ``apple_health_export/export.xml``
    inside the given export.zip and returns them as a single tidy DataFrame
    with timezone-aware datetime columns.

    The output schema is a superset of the one produced by
    apple-health-exporter's ``health_xml_to_feather``, adding ``device``,
    ``sourceVersion``, and ``creationDate`` columns.

    Args:
        zip_path: Path to the export.zip file generated by the Apple Health
            app.

    Returns:
        A DataFrame with one row per health record and the following columns:

        - type (str): The HealthKit record type, e.g.
          ``HKQuantityTypeIdentifierHeartRate``.
        - sourceName (str): Human-readable name of the recording source,
          e.g. ``"Apple Watch"``.
        - sourceVersion (str): Version string of the recording source app.
        - device (str): Full device identifier string as reported by
          HealthKit.
        - unit (str): Unit of measurement, e.g. ``"count/min"`` or ``"kg"``.
        - startDate (timezone-aware datetime, UTC): Start of the recorded
          interval.
        - endDate (timezone-aware datetime, UTC): End of the recorded
          interval.
        - creationDate (timezone-aware datetime, UTC): When the record was
          created.
        - value (str): The recorded value as a string, e.g. ``"72"`` or
          ``"5.6"``. Cast to a numeric type by the caller if needed.

        An attribute that a record does not carry is ``None`` in the string
        columns and ``NaT`` in the date columns.

    Raises:
        NoHealthDataError: If the export contains no ``Record`` elements.
        KeyError: If the archive has no ``apple_health_export/export.xml``
            member (raised by ``zipfile``).

    Example:
        >>> df = parse_apple_health("export.zip")
        >>> df.to_feather("data.feather")
        >>> heart_rate = df[df["type"] == "HKQuantityTypeIdentifierHeartRate"]
        >>> heart_rate["value"].astype(float).mean()
    """
    date_columns = ("startDate", "endDate", "creationDate")

    # One tuple per record, ordered like _COLUMNS; tuples are lighter than
    # per-record dicts, which matters for exports with very many records.
    rows = []
    with (
        zipfile.ZipFile(zip_path) as zf,
        zf.open("apple_health_export/export.xml") as f,
    ):
        # Stream the document instead of loading the whole tree at once.
        for _event, elem in ETree.iterparse(f, events=("end",)):
            if elem.tag == "Record":
                rows.append(tuple(elem.attrib.get(col) for col in _COLUMNS))
            # The element is complete at its "end" event, so its contents can
            # be released immediately to keep the in-memory tree small.
            elem.clear()

    if not rows:
        logger.error("No records found in zip file %s.", zip_path)
        raise NoHealthDataError(f"No records found in zip file {zip_path}.")

    df = pd.DataFrame(rows, columns=list(_COLUMNS))

    # Deliver the datetime columns the contract above promises. utc=True
    # normalises every value to UTC, so records written with different UTC
    # offsets still share a single timezone-aware dtype.
    # NOTE(review): assumes the date attributes are strings pandas can parse
    # (presumably of the form "2023-01-15 08:30:00 +0100") - confirm against
    # a real export; an unparseable value makes pd.to_datetime raise.
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], utc=True)

    return df