# Path of the matplotlib style sheet passed to plt.style.context() when plotting.
TIME_SERIES_STYLE = 'src/time_series.mplstyle'


import pandas as pd
import matplotlib.pyplot as plt


# Load every <Record> element of the Apple Health export into a DataFrame.
# The path is given to pandas directly so that pandas opens *and closes* the
# file itself; wrapping it in a bare open() would leave the handle open and
# decode the XML with the platform's locale encoding.
df = pd.read_xml(
    'data/apple_health_export/dati esportati.xml',
    xpath="//Record",
)


# Keep the rows from index label 3 onward and discard the MetadataEntry column.
# TODO: confirm that skipping the leading rows is really intended.
df = df.loc[3:].drop(columns='MetadataEntry')

# Parse the timestamp columns into datetimes
cols = ['creationDate', 'startDate', 'endDate']
df[cols] = df[cols].apply(pd.to_datetime)

# Index each record by the midpoint of its [startDate, endDate] interval
half_duration = (df['endDate'] - df['startDate']) / 2
df = df.assign(middleDate=df['startDate'] + half_duration).set_index('middleDate')


# Show three randomly chosen records as a markdown table
preview = df.sample(3)
print(preview.to_markdown())


# Distinct record types present in the data (only displayed in a notebook)
df['type'].unique()

# Notebook cell output: the distinct record types found in the export.
# array(['HKQuantityTypeIdentifierStepCount',
#        'HKQuantityTypeIdentifierDistanceWalkingRunning',
#        'HKQuantityTypeIdentifierFlightsClimbed',
#        'HKQuantityTypeIdentifierHeadphoneAudioExposure',
#        'HKQuantityTypeIdentifierWalkingDoubleSupportPercentage',
#        'HKQuantityTypeIdentifierWalkingSpeed',
#        'HKQuantityTypeIdentifierWalkingStepLength',
#        'HKQuantityTypeIdentifierWalkingAsymmetryPercentage'], dtype=object)


# Split the records into one frame per record type of interest
record_type = df['type']
step_count = df.loc[record_type == 'HKQuantityTypeIdentifierStepCount']
walked_km = df.loc[record_type == 'HKQuantityTypeIdentifierDistanceWalkingRunning']
climbed_floors = df.loc[record_type == 'HKQuantityTypeIdentifierFlightsClimbed']


# Daily totals of steps, distance and floors, drawn as three stacked bar
# charts that share the same time axis.
with plt.style.context(TIME_SERIES_STYLE):
    date_start = '2019-10'
    date_end = None  # None leaves the slice open-ended on the right

    # (records, y-axis label) for each subplot, top to bottom
    collection = [
        (step_count, 'n. of steps'),
        (walked_km, 'n. of km walked'),
        (climbed_floors, 'n. of floors climbed'),
    ]

    fig, axs = plt.subplots(3, 1, figsize=(20, 12), sharex=True)
    axs[0].set_title('Daily information about steps, kilometres and floors', fontsize=17)
    for ax, (records, label) in zip(axs, collection):
        # Select the 'value' column *before* aggregating: summing the whole
        # frame would also try to sum the text and datetime columns, which
        # recent pandas versions no longer silently drop.
        d = records['value'].resample('D').sum().loc[date_start:date_end]
        ax.bar(x=d.index, height=d.values)
        ax.grid(axis='x')
        ax.set_ylabel(label, fontsize=12)

    #plt.savefig('img/output/daily_info.png', transparent=True)


# Average quantity recorded on each week day, hour by hour
def mean_quantity_eachday_hbh(df, day=None, date_start='2019-10', date_end='2020-03-15'):
    """Return hourly mean and standard deviation of a quantity per week day.

    The records are summed into hourly bins, restricted to the window
    ``date_start``..``date_end`` and grouped by the label
    ``'<week day> <HH:MM>'`` (``strftime('%A %H:%M')``).

    Parameters
    ----------
    df : pandas.DataFrame
        Records indexed by a DatetimeIndex, with a numeric ``value`` column.
    day : str, optional
        Week-day name used to select the matching group labels
        (e.g. ``'Monday'``). When omitted, all groups are returned.
    date_start, date_end : str or None
        Bounds of the label slice applied after resampling; the defaults
        reproduce the previously hard-coded window.

    Returns
    -------
    tuple
        Without ``day``: ``(mean, std)`` DataFrames indexed by the group
        label. With ``day``: two NumPy arrays for that day, ordered by hour:
        the *cumulative* sum of the hourly means of ``value`` and the
        (non-cumulative) hourly standard deviation of ``value``.
    """
    # Aggregate numeric columns only: text and datetime columns cannot be
    # summed/averaged and make recent pandas versions raise.
    numeric = df.select_dtypes(include='number')

    # Hourly resampling
    data = numeric.resample('h').sum().loc[date_start:date_end]

    # Group by '%A %H:%M', i.e. 'DayOfTheWeek Hour:Minute', building the
    # grouping once and reusing it for both statistics
    by_weekday_hour = data.groupby(data.index.strftime('%A %H:%M'))
    m = by_weekday_hour.mean()
    s = by_weekday_hour.std()

    if not day:
        return m, s

    # Group labels sort as text, so within one day the rows are in hour order
    m_d = m.loc[m.index.str.contains(day), 'value'].cumsum().values
    s_d = s.loc[s.index.str.contains(day), 'value'].values
    return m_d, s_d

def plot_mean_quantity_eachday_hbh(
    df,
    list_of_days,
    fig,
    ax,
    ylabel='n. of floors climbed',
    title='Average number of flights of stairs climbed \n(data from 2019-10 to 2020-03)',
):
    """Plot, for each requested week day, the hourly profile of a quantity.

    For every day in ``list_of_days`` one line is drawn from the first array
    returned by ``mean_quantity_eachday_hbh(df, day)`` and a translucent band
    of plus/minus the second array is shaded around it.

    Parameters
    ----------
    df : pandas.DataFrame
        Records forwarded unchanged to ``mean_quantity_eachday_hbh``.
    list_of_days : iterable of str
        Week-day names; each one becomes a labelled line.
    fig : matplotlib figure
        Not used by this function; accepted so that the result of
        ``plt.subplots()`` can be unpacked straight into the call.
    ax : matplotlib axes
        Axes to draw on.
    ylabel, title : str
        Axis label and title. The defaults keep the texts that were
        previously hard-coded for the climbed-floors chart.
    """
    hours = range(24)
    for day in list_of_days:
        m_d, s_d = mean_quantity_eachday_hbh(df, day)
        ax.plot(hours, m_d, label=day)
        ax.fill_between(hours, m_d - s_d, m_d + s_d, alpha=.1)

    # One tick every three hours, labelled 'HH:00'
    ticks = range(0, 24, 3)
    ax.set_xticks(ticks)
    ax.set_xticklabels(labels=[f'{hour:02d}:00' for hour in ticks])
    ax.set_xlabel('Time', fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.legend()
    ax.set_title(title, fontsize=15)
    ax.grid(axis='x')

# Hourly profile of climbed floors, one line per day of the week
with plt.style.context(TIME_SERIES_STYLE):
    week_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    hourly_fig, hourly_ax = plt.subplots()
    plot_mean_quantity_eachday_hbh(climbed_floors, week_days, hourly_fig, hourly_ax)
    #plt.savefig('img/output/hourly_floors_climbed.png', transparent=True)

# Apple health data analysis

# Import Data

# Analysis