{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n\n# The final ECG Example dataset\n\nThis is the final ECG Example dataset that we developed step by step in the example `custom_dataset_ecg`.\nThis file can be used as a quick reference or to import the class into other examples without side effects.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "from functools import lru_cache\n",
        "from itertools import cycle\n",
        "from pathlib import Path\n",
        "from typing import List, Optional, Union\n",
        "\n",
        "import pandas as pd\n",
        "\n",
        "from tpcp import Dataset\n",
        "\n",
        "\n",
        "def load_pandas_pickle_file(file_path):\n",
        "    \"\"\"Load a pickled pandas object from `file_path`.\n",
        "\n",
        "    Note: unpickling can execute arbitrary code, so only load files from a trusted source.\n",
        "    \"\"\"\n",
        "    return pd.read_pickle(file_path)\n",
        "\n",
        "\n",
        "# Memoized variant of the loader: `lru_cache(10)` keeps the results of the 10 most recently used calls.\n",
        "cached_load_pandas_pickle_file = lru_cache(10)(load_pandas_pickle_file)\n",
        "\n",
        "\n",
        "class ECGExampleData(Dataset):\n",
        "    \"\"\"Dataset giving access to the ECG recordings and R-peak annotations stored in `data_path`.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    data_path\n",
        "        Folder containing the `<participant>.pk.gz`, `<participant>_all.csv` and `<participant>_pvc.csv` files.\n",
        "    use_lru_cache\n",
        "        If True, the raw ECG files are loaded through `cached_load_pandas_pickle_file`.\n",
        "        Repeated access then returns the same cached object, so it should not be modified in place.\n",
        "    groupby_cols\n",
        "        Passed on unchanged to the `Dataset` base class.\n",
        "    subset_index\n",
        "        Passed on unchanged to the `Dataset` base class.\n",
        "\n",
        "    \"\"\"\n",
        "\n",
        "    data_path: Path\n",
        "    use_lru_cache: bool\n",
        "\n",
        "    def __init__(\n",
        "        self,\n",
        "        data_path: Path,\n",
        "        *,\n",
        "        use_lru_cache: bool = True,\n",
        "        groupby_cols: Optional[Union[List[str], str]] = None,\n",
        "        subset_index: Optional[pd.DataFrame] = None,\n",
        "    ):\n",
        "        self.data_path = data_path\n",
        "        self.use_lru_cache = use_lru_cache\n",
        "        super().__init__(groupby_cols=groupby_cols, subset_index=subset_index)\n",
        "\n",
        "    @property\n",
        "    def sampling_rate_hz(self) -> float:\n",
        "        \"\"\"The sampling rate of the raw ECG recording in Hz\"\"\"\n",
        "        return 360.0\n",
        "\n",
        "    @property\n",
        "    def data(self) -> pd.DataFrame:\n",
        "        \"\"\"The raw ECG data of a participant's recording.\n",
        "\n",
        "        The dataframe contains a single column called \"ecg\".\n",
        "        The index values are just samples.\n",
        "        You can use the sampling rate (`self.sampling_rate_hz`) to convert them into time.\n",
        "        \"\"\"\n",
        "        # Check that there is only a single participant in the dataset\n",
        "        self.assert_is_single(None, \"data\")\n",
        "        # Reconstruct the ecg file path based on the data index.\n",
        "        # `.iloc[0]` selects the first row by position, so the lookup does not depend on the index labels.\n",
        "        p_id = self.index[\"participant\"].iloc[0]\n",
        "        file_path = self.data_path / f\"{p_id}.pk.gz\"\n",
        "        # We try to use the cache if enabled.\n",
        "        if self.use_lru_cache:\n",
        "            return cached_load_pandas_pickle_file(file_path)\n",
        "        return load_pandas_pickle_file(file_path)\n",
        "\n",
        "    @property\n",
        "    def r_peak_positions_(self) -> pd.DataFrame:\n",
        "        \"\"\"The sample positions of all R-peaks in the ECG data.\n",
        "\n",
        "        This includes all R-Peaks (PVC or normal)\n",
        "        \"\"\"\n",
        "        # Pass the property's own name, so that a raised error points to the right attribute.\n",
        "        self.assert_is_single(None, \"r_peak_positions_\")\n",
        "        p_id = self.group.participant\n",
        "        r_peaks = pd.read_csv(self.data_path / f\"{p_id}_all.csv\", index_col=0)\n",
        "        r_peaks = r_peaks.rename(columns={\"R\": \"r_peak_position\"})\n",
        "        return r_peaks\n",
        "\n",
        "    @property\n",
        "    def pvc_positions_(self) -> pd.DataFrame:\n",
        "        \"\"\"The positions of R-peaks belonging to abnormal PVC peaks in the data stream.\n",
        "\n",
        "        The position is equivalent to a position entry in `self.r_peak_positions_`.\n",
        "        \"\"\"\n",
        "        self.assert_is_single(None, \"pvc_positions_\")\n",
        "        p_id = self.group.participant\n",
        "        pvc_peaks = pd.read_csv(self.data_path / f\"{p_id}_pvc.csv\", index_col=0)\n",
        "        pvc_peaks = pvc_peaks.rename(columns={\"PVC\": \"pvc_position\"})\n",
        "        return pvc_peaks\n",
        "\n",
        "    @property\n",
        "    def labeled_r_peaks_(self) -> pd.DataFrame:\n",
        "        \"\"\"All r-peak positions with an additional column that labels them as normal or PVC.\"\"\"\n",
        "        self.assert_is_single(None, \"labeled_r_peaks_\")\n",
        "        # `r_peak_positions_` reads the csv file on every access, so adding a column here modifies a fresh frame.\n",
        "        r_peaks = self.r_peak_positions_\n",
        "        r_peaks[\"label\"] = \"normal\"\n",
        "        r_peaks.loc[r_peaks[\"r_peak_position\"].isin(self.pvc_positions_[\"pvc_position\"]), \"label\"] = \"pvc\"\n",
        "        return r_peaks\n",
        "\n",
        "    def create_index(self) -> pd.DataFrame:\n",
        "        \"\"\"Build the dataset index with one row per participant found in `data_path`.\n",
        "\n",
        "        Raises\n",
        "        ------\n",
        "        ValueError\n",
        "            If no `*_all.csv` file is found in `data_path`.\n",
        "\n",
        "        \"\"\"\n",
        "        # The participant id is the part of the file name before the first underscore.\n",
        "        participant_ids = [f.name.split(\"_\")[0] for f in sorted(self.data_path.glob(\"*_all.csv\"))]\n",
        "        # Assign the three group labels to the participants in a repeating cycle.\n",
        "        patient_group = [g for g, _ in zip(cycle((\"group_1\", \"group_2\", \"group_3\")), participant_ids)]\n",
        "        df = pd.DataFrame({\"patient_group\": patient_group, \"participant\": participant_ids})\n",
        "        if len(df) == 0:\n",
        "            raise ValueError(\n",
        "                \"The dataset is empty. Are you sure you selected the correct folder? Current folder is: \"\n",
        "                f\"{self.data_path}\"\n",
        "            )\n",
        "        return df"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.15"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}