import pandas as pd
import numpy as np
import pyreadstat
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from scipy import stats

# Turn off pandas' chained-assignment check (the SettingWithCopy warning) for
# the whole session; setting the option to None silences it.
pd.set_option('mode.chained_assignment', None)


!gen3 drs-pull object dg.4503/8d84511c-76f9-4464-8fdf-6dd668ed9c64

# Load the CAMP teaching dataset from the SPSS .sav file. read_sav returns the
# data frame together with a metadata object describing the columns.
camp_df, camp_meta = pyreadstat.read_sav("camp_teach.sav", apply_value_formats=True)


# Display column names and column description
# column_names_to_labels maps each column name to its descriptive label, so the
# (key, value) pairs come out as (Name, Label) -- the headers must be in that
# order, otherwise the table is mislabelled.
col_names = camp_meta.column_names_to_labels
pd.DataFrame(list(col_names.items()), columns=['Name', 'Label'])


# add age group to the dataframe
def age_group(agelist):
    """Map each age (in years) to an age-group label.

    Groups are assigned by completed years, so fractional ages stay in the
    group of their whole-year part (5.5 -> "2-5yr", 11.5 -> "6-11yr").

    Args:
        agelist: Iterable of numeric ages.

    Returns:
        A list of labels, one per input age and in the same order. Values
        that match no range (e.g. NaN, for which every comparison is False)
        are labelled "NA".
    """
    grouplabel1 = "Early Childhood (2-5yr)"
    grouplabel2 = "Middle Childhood (6-11yr)"
    grouplabel3 = "Early Adolescence (12-18yr)"
    grouplist = []
    for age in agelist:
        if age < 6:
            grouplist.append(grouplabel1)
        elif age < 12:
            # Strict upper bound closes the gap that previously sent ages
            # between 11 and 12 to "NA".
            grouplist.append(grouplabel2)
        elif age >= 12:
            grouplist.append(grouplabel3)
        else:
            # Only reachable for values that fail every comparison (NaN).
            grouplist.append("NA")
    return grouplist
# Label every record with the age group derived from the age_rz column.
camp_df['age_group'] = age_group(camp_df['age_rz'])

# Baseline records are the rows whose visit code is "000".
is_baseline = camp_df["visitc"] == "000"
first_visit = camp_df.loc[is_baseline]
first_visit.head(3)


# first_visit holds only the rows with visitc == "000", so the first element of
# its shape (the row count) shows how many participants were enrolled in the
# study -- assuming one baseline row per participant.
first_visit.shape

(695, 29)


# Number of baseline records per GENDER code (m = male, f = female)
first_visit['GENDER'].value_counts()

m    412
f    283
Name: GENDER, dtype: int64


# Plot the composition of age groups by gender among participants in the CAMP study
# Rows of the crosstab are age groups; columns are the GENDER codes ('f', 'm').
count_sex_age = pd.crosstab(index=first_visit['age_group'], columns=first_visit['GENDER'])

# Take the slice labels from the crosstab index so every label is guaranteed to
# line up with its count, even if a group is absent or the row order changes.
labels = count_sex_age.index.tolist()
pie_age_gender = make_subplots(1, 2, specs=[[{'type': 'domain'}, {'type': 'domain'}]],
                               subplot_titles=['Female', 'Male'])
# A shared scalegroup sizes the two pies relative to their totals.
pie_age_gender.add_trace(go.Pie(labels=labels, values=count_sex_age['f'], scalegroup='one',
                                name="Female"), 1, 1)
pie_age_gender.add_trace(go.Pie(labels=labels, values=count_sex_age['m'], scalegroup='one',
                                name="Male"), 1, 2)

# NOTE(review): assigning layout.annotations here appears to replace the
# subplot-title annotations created by make_subplots, leaving only these two
# labels (positioned inside the donut holes) -- confirm this is intended.
pie_age_gender.update_layout(title_text='Gender and Age Characteristics of CAMP Study',
                             annotations=[dict(text='Female', x=0.225, y=0.47, font_size=15, showarrow=False),
                                          dict(text='Male', x=0.78, y=0.46, font_size=15, showarrow=False)],
                             width=800, height=400)
# hole=.4 turns the pies into donuts.
pie_age_gender.update_traces(hole=.4, hoverinfo="label+value+percent+name")
pie_age_gender.show()


# Plot the composition of ethnicity groups by gender among participants in the CAMP study
# Rows of the crosstab are the ETHNIC codes; columns are the GENDER codes ('f', 'm').
count_sex_ethnic = pd.crosstab(index=first_visit['ETHNIC'], columns=first_visit['GENDER'])

# Translate each ETHNIC code in the crosstab index to its display name instead
# of relying on a positional list, so labels cannot drift out of step with the
# counts. Unknown codes are shown unchanged.
ethnic_names = {"b": "black", "h": "hispanic", "o": "other", "w": "white"}
ethnic_labels = [ethnic_names.get(code, code) for code in count_sex_ethnic.index]
pie_ethnic_gender = make_subplots(1, 2, specs=[[{'type': 'domain'}, {'type': 'domain'}]],
                                  subplot_titles=['Female', 'Male'])
# A shared scalegroup sizes the two pies relative to their totals.
pie_ethnic_gender.add_trace(go.Pie(labels=ethnic_labels, values=count_sex_ethnic['f'], scalegroup='one',
                                   name="Female"), 1, 1)
pie_ethnic_gender.add_trace(go.Pie(labels=ethnic_labels, values=count_sex_ethnic['m'], scalegroup='one',
                                   name="Male"), 1, 2)

# NOTE(review): assigning layout.annotations here appears to replace the
# subplot-title annotations created by make_subplots, leaving only these two
# labels (positioned inside the donut holes) -- confirm this is intended.
pie_ethnic_gender.update_layout(title_text='Gender and Ethnicity Characteristics of CAMP Study',
                                annotations=[dict(text='Female', x=0.225, y=0.47, font_size=15, showarrow=False),
                                             dict(text='Male', x=0.78, y=0.46, font_size=15, showarrow=False)],
                                width=800, height=400)
# hole=.4 turns the pies into donuts.
pie_ethnic_gender.update_traces(hole=.4, hoverinfo="label+value+percent+name")
pie_ethnic_gender.show()


# Number of baseline records in each treatment group (TX column)
first_visit['TX'].value_counts()

ned     210
bud     210
pned    141
pbud    134
Name: TX, dtype: int64


# Baseline rows that actually have a PREFEVPP measurement.
first_visit_rmna = first_visit.loc[first_visit['PREFEVPP'].notna()]


# Box plots of PREFEVPP at the first visit: one group per ETHNIC value on the
# x axis, with boxes coloured by GENDER.
histo_PREFEVPP_ethnic = px.box(
    first_visit_rmna,
    x='ETHNIC',
    y="PREFEVPP",
    color="GENDER",
    title="Boxplot of PREFEVPP at First Visit",
)
histo_PREFEVPP_ethnic.show()


# Numeric copy of the visit code (e.g. "072" -> 72) for filtering and plotting.
camp_df['visit_month'] = camp_df['visitc'].astype(int)

visit_month_list = [0, 12, 24, 36, 48, 60, 72]
# Extract the ids that have a PREFEVPP value at every one of these visits.
all_id = camp_df['id'].unique().tolist()
required_months = set(visit_month_list)
# One pass over the frame: collect, per id, the set of months with a non-missing
# PREFEVPP, instead of re-filtering the whole frame once for every id.
months_by_id = (
    camp_df.loc[camp_df['PREFEVPP'].notna()]
    .groupby('id')['visit_month']
    .agg(set)
)
# Ids with no measured visit at all are absent from months_by_id; treat them as
# having an empty set. Iterating all_id keeps the original ordering.
fig_id_list = [
    i for i in all_id
    if required_months.issubset(months_by_id.get(i, set()))
]


# Restrict to the participants in fig_id_list and to the visits listed in
# visit_month_list.
has_all_visits = camp_df['id'].isin(fig_id_list)
is_selected_visit = camp_df['visit_month'].isin(visit_month_list)
camp_df_subset = camp_df.loc[has_all_visits & is_selected_visit]

# Box plots of PREFEVPP per visit month, coloured by GENDER, with one facet
# row per treatment group (TX).
line_PREFEVPP_visit = px.box(
    camp_df_subset,
    x='visit_month',
    y='PREFEVPP',
    color="GENDER",
    facet_row="TX",
    width=800,
    height=800,
)
line_PREFEVPP_visit.show()


# Define id list that have both records of PREFEVPP at 72 month visit and first visit
def intersection(lst1, lst2):
    """Return the distinct elements present in both inputs, as a list.

    Both arguments are converted to sets first, so duplicates are dropped and
    the order of the result is not defined.
    """
    first, second = set(lst1), set(lst2)
    common = first & second
    return list(common)

# Ids with a non-missing PREFEVPP at the 72-month visit and at the first visit.
has_prefevpp = camp_df["PREFEVPP"].notna()
visit_72_id = camp_df.loc[(camp_df["visitc"] == "072") & has_prefevpp, 'id'].tolist()
visit_0_id = camp_df.loc[(camp_df["visitc"] == "000") & has_prefevpp, 'id'].tolist()

# Participants measured at both time points.
id_intersect = intersection(visit_72_id, visit_0_id)
in_both_visits = camp_df['id'].isin(id_intersect)

# 72-month rows: keep only the id and the two %pred measurements, renamed with
# a _72 suffix so they do not clash with the baseline columns in the merge.
visit_72_df = (
    camp_df.loc[in_both_visits & (camp_df['visit_month'] == 72), ['id', 'PREFEVPP', 'POSFEVPP']]
    .rename(columns={"PREFEVPP": "PREFEVPP_72", "POSFEVPP": "POSFEVPP_72"})
)
# Baseline rows: keep every column.
visit_0_df = camp_df.loc[in_both_visits & (camp_df['visit_month'] == 0)]

# Join baseline and 72-month rows on participant id.
fev1_72_df = pd.merge(visit_0_df, visit_72_df, how='inner', on='id')

# PREFEVPP_diff = PREFEVPP at 72 months minus PREFEVPP at month 0.
fev1_72_df['PREFEVPP_diff'] = fev1_72_df['PREFEVPP_72'] - fev1_72_df['PREFEVPP']
fev1_72_df.head()


# Number of rows per treatment group (TX) in the merged baseline/72-month
# frame, i.e. among participants with PREFEVPP recorded at both visits
fev1_72_df['TX'].value_counts()

bud     178
ned     168
pned    124
pbud    109
Name: TX, dtype: int64


# Histograms of PREFEVPP_diff, one facet row per treatment group (TX).
# They serve as a visual check of the shape of each group's distribution
# before the groups are compared.
fig_his = px.histogram(
    fev1_72_df,
    x="PREFEVPP_diff",
    facet_row="TX",
    title="Histograms of PREFEVPP Diffs Between 72 and 0 Month",
    height=600,
    width=800,
)


def _strip_facet_prefix(annotation):
    # Facet annotations read "TX=<group>"; keep only the text after the last "=".
    return annotation.update(text=annotation.text.split("=")[-1])


fig_his.for_each_annotation(_strip_facet_prefix)
fig_his.update_layout(xaxis_title="Diff of PREFEVPP between 72 and 0 month")
fig_his.show()


# Split the PREFEVPP_diff column into one Series per treatment group.
tx_column = fev1_72_df['TX']
diff_column = fev1_72_df['PREFEVPP_diff']
bud_diff = diff_column[tx_column == 'bud']
pbud_diff = diff_column[tx_column == 'pbud']
ned_diff = diff_column[tx_column == 'ned']
pned_diff = diff_column[tx_column == 'pned']


# T test between bud treatment group and bud placebo control group.
# equal_var=False requests Welch's t-test, which does not assume equal
# variances; nan_policy='raise' makes the call fail if a sample contains NaN.
stats.ttest_ind(bud_diff, pbud_diff, equal_var=False, nan_policy='raise')

Ttest_indResult(statistic=1.5158970151693059, pvalue=0.13114722800749987)


# T test between ned treatment group and ned placebo control group.
# equal_var=False requests Welch's t-test, which does not assume equal
# variances; nan_policy='raise' makes the call fail if a sample contains NaN.
stats.ttest_ind(ned_diff, pned_diff, equal_var=False, nan_policy='raise')

Ttest_indResult(statistic=0.20451809019881795, pvalue=0.8380950900486477)

	Label	Name
0	TX	Treatment group: bud,ned,pbud,or pned
1	TG	Treatment group: A=bud, B=ned, C=plbo
2	id	None
3	age_rz	Age in years at Randomization
4	GENDER	m=male, f=female
5	ETHNIC	w=white,b=black,h=hispanic,o=other
6	hemog	Hemoglobin (g/dl)
7	PREFEV	PreBD FEV1
8	PREFVC	PreBD FVC
9	PREFF	PreBD FEV1/FVC ratio %
10	PREPF	PreBD peak flow
11	POSFEV	PostBD FEV1
12	POSFVC	PostBD FVC
13	POSFF	PostBD FEV1/FVC ratio %
14	POSPF	PostBD peak flow
15	PREFEVPP	PreBD FEV1 %pred
16	PREFVCPP	PreBD FVC %pred
17	POSFEVPP	PostBD FEV1 %pred
18	POSFVCPP	PostBD FVC %pred
19	wbc	White Blood Cell count (1000 cells/ul)
20	agehome	Age of current home (years)
21	anypet	Any pets, 1=Yes 2=No
22	woodstove	Used wood stove for heating/cooking, 1=Yes 2=No
23	dehumid	Use a dehumidifier, 1=Yes 2=No 3=DK
24	parent_smokes	Either Parent/partner smokes in home, 1=Yes 2=No
25	any_smokes	Anyone (including visitors) smokes in home, 1=...
26	visitc	Followup Visit (mos)
27	fdays	Days since randomization

	TX	TG	id	age_rz	GENDER	ETHNIC	hemog	PREFEV	PREFVC	PREFF	...	wbc	agehome	anypet	woodstove	dehumid	parent_smokes	any_smokes	age_group
0	ned	B	1.0	5.0	m	o	12.5	1.38	1.75	79.0	...	65.0	50.0	1.0	2.0	2.0	1.0	1.0	Early Childhood (2-5yr)
15	ned	B	2.0	11.0	m	b	12.5	1.78	2.49	71.0	...	82.0	34.0	1.0	2.0	1.0	2.0	1.0	Middle Childhood (6-11yr)
31	ned	B	4.0	7.0	f	w	13.6	2.28	2.62	87.0	...	54.0	34.0	2.0	2.0	2.0	2.0	2.0	Middle Childhood (6-11yr)

	TX	TG	id	age_rz	GENDER	ETHNIC	hemog	PREFEV	PREFVC	PREFF	...	dehumid	parent_smokes	any_smokes	age_group	PREFEVPP_72	POSFEVPP_72	PREFEVPP_diff
0	ned	B	1.0	5.0	m	o	12.5	1.38	1.75	79.0	...	2.0	1.0	1.0	Early Childhood (2-5yr)	92.0	98.0	11.0
1	ned	B	2.0	11.0	m	b	12.5	1.78	2.49	71.0	...	1.0	2.0	1.0	Middle Childhood (6-11yr)	94.0	110.0	4.0
2	ned	B	4.0	7.0	f	w	13.6	2.28	2.62	87.0	...	2.0	2.0	2.0	Middle Childhood (6-11yr)	102.0	111.0	-2.0
3	ned	B	5.0	5.0	m	h	13.8	1.02	1.17	87.0	...	2.0	2.0	2.0	Early Childhood (2-5yr)	92.0	107.0	-20.0
4	bud	A	9.0	12.0	f	w	12.6	1.51	1.84	82.0	...	2.0	2.0	2.0	Early Adolescence (12-18yr)	85.0	103.0	-24.0

Data Exploration of Childhood Asthma Management Program Study Teaching Dataset

Import Python libraries

Read object file

Participants demographic data exploration and visualization

Key measurements exploration and visualization

Analysis of long-term effect of budesonide and nedocromil on pulmonary function