# Option to suppress warnings (-1 = suppressed, 0 = unsuppressed)
options(warn = -1)


#Installing required libraries
install.packages(c('tidyverse', 'tidyr', 'haven', 'reshape'))


#Importing libraries
library(tidyverse)
library(tidyr)
library(haven)
library(reshape)


# Download the .sav data file with the gen3 client; system() returns the
# command's exit status, which is kept for the error message below
pull_status <- system("gen3 drs-pull object dg.4503/c97507dd-bb79-45ce-9186-3d14ad823f81")

# Import the file into a data frame using the haven package. If the file is
# missing, stop here: every later step needs frm_data, so continuing would
# only fail further down with an unrelated "object not found" error
if (file.exists("frmgham2.sav")) {
  print("file successfully created")
  frm_data <- read_sav("frmgham2.sav", user_na = TRUE)
} else {
  stop(
    "frmgham2.sav was not found (gen3 exit status: ", pull_status,
    "); check your gen3 command",
    call. = FALSE
  )
}

[1] "file successfully created"


# Convert a column (used below for the patient ID) to character.
#
# Args:
#   col: an atomic vector, e.g. a numeric ID column.
# Returns:
#   A plain character vector with one element per element of `col`;
#   character(0) when `col` is empty.
col_to_char <- function(col) {
  # as.character() is vectorized, so no element-wise loop is needed. A loop
  # over 1:length(col) would also visit indices 1 and 0 on empty input and
  # hand back a spurious NA.
  as.character(col)
}

frm_data$RANDID <- col_to_char(frm_data$RANDID)


# Map the numeric SEX code onto a label: 1 -> "Male", any other code -> "Female".
# The comparison is vectorized, so the result is a character vector from the
# start (no numeric placeholder that is later overwritten with strings), and a
# missing SEX code yields NA instead of aborting inside an if() condition
frm_data$SEX_GROUP <- ifelse(as.numeric(frm_data$SEX) == 1, "Male", "Female")


# Keep only the rows recorded at the first visit (PERIOD == 1)
visit_1 <- filter(frm_data, PERIOD == 1)
head(visit_1, n = 5)


# Functions to categorize age, BMI, and education value into groups

# Bin ages into labelled groups.
#
# Args:
#   age_list: numeric vector of ages; may contain NA.
# Returns:
#   Character vector with one label per age:
#     age <= 40      -> "<min>-40 yr", where <min> is the smallest
#                       non-missing age in `age_list`
#     40 < age <= 50 -> "41-50 yr", and so on in steps of ten up to "81-90 yr"
#     missing or > 90 -> the string "NA"
age_group <- function(age_list) {
  ages <- as.numeric(age_list)

  # Label the youngest bin from the non-missing ages only; a plain min() on a
  # vector containing NA returns NA and would produce the label "NA-40 yr"
  known_ages <- ages[!is.na(ages)]
  min_age <- if (length(known_ages) > 0) min(known_ages) else NA

  group_labels <- c(
    paste0(min_age, "-40 yr"),
    "41-50 yr",
    "51-60 yr",
    "61-70 yr",
    "71-80 yr",
    "81-90 yr",
    "NA" # ages above 90
  )

  # With left.open = TRUE the intervals are right-closed, so findInterval()
  # gives 0 for age <= 40, 1 for 40 < age <= 50, ..., 6 for age > 90
  bin <- findInterval(ages, c(40, 50, 60, 70, 80, 90), left.open = TRUE) + 1
  groups <- group_labels[bin]
  groups[is.na(ages)] <- "NA"
  groups
}

# Bin BMI values into weight categories.
#
# Args:
#   bmi_list: numeric vector of BMI values; may contain NA.
# Returns:
#   Character vector with one label per value:
#     BMI < 18.5       -> "underweight"
#     18.5 <= BMI < 25 -> "healthy"
#     25 <= BMI < 30   -> "overweight"
#     BMI >= 30        -> "obese"
#     missing          -> the string "NA"
#   Always character, including character(0) for empty input.
bmi_group <- function(bmi_list) {
  bmi <- as.numeric(bmi_list)
  group_labels <- c("underweight", "healthy", "overweight", "obese")

  # findInterval() counts how many cut points are <= the value (0 to 3), which
  # is exactly the left-closed binning described above
  groups <- group_labels[findInterval(bmi, c(18.5, 25, 30)) + 1]
  groups[is.na(bmi)] <- "NA"
  groups
}

# Translate education codes into descriptive labels.
#
# Args:
#   edu_list: numeric vector of education codes; may contain NA.
# Returns:
#   Character vector with one label per code:
#     1 -> "0-11 years"
#     2 -> "High School Diploma, GED"
#     3 -> "Some College, Vocational School"
#     4 -> "College (BS, BA) degree or more"
#     missing or any other code -> the string "NA"
#   Always character, including character(0) for empty input.
educ_group <- function(edu_list) {
  group_labels <- c(
    "0-11 years",
    "High School Diploma, GED",
    "Some College, Vocational School",
    "College (BS, BA) degree or more"
  )

  # match() returns NA for missing or unrecognised codes, which then index
  # to NA and are relabelled below
  groups <- group_labels[match(as.numeric(edu_list), 1:4)]
  groups[is.na(groups)] <- "NA"
  groups
}

# Turn long-format counts into a readable group-by-sex table.
#
# Args:
#   odata: data frame whose first column is the group label, whose second
#          column is the sex label ("Female" or "Male"), and which has a
#          count column named `n` (the layout produced by count()).
#   data:  matrix with one row per group and two columns. Kept so existing
#          calls keep working; the result is now built from `odata` alone.
# Returns:
#   Matrix with one row per distinct group (in order of first appearance in
#   `odata`) and the columns "Female" and "Male". A group/sex combination
#   that does not occur in `odata` is reported as 0.
to_nice_table <- function(odata, data) {
  groups <- as.character(odata[[1]])
  sexes <- as.character(odata[[2]])
  row_labels <- unique(groups)
  col_labels <- c("Female", "Male")

  nice <- matrix(
    0L,
    nrow = length(row_labels),
    ncol = length(col_labels),
    dimnames = list(row_labels, col_labels)
  )

  # Place every count by its own group and sex label. Filling by position
  # assumes each group has exactly one Female row followed by one Male row;
  # a group with a single sex would shift all later counts and row names.
  known_sex <- sexes %in% col_labels
  cell_index <- cbind(
    match(groups[known_sex], row_labels),
    match(sexes[known_sex], col_labels)
  )
  nice[cell_index] <- odata$n[known_sex]
  nice
}


# Attach the age, BMI and education categories to the first-visit data
visit_1[["AGE_GROUP"]] <- age_group(visit_1[["AGE"]])
visit_1[["BMI_GROUP"]] <- bmi_group(visit_1[["BMI"]])
visit_1[["EDUC_GROUP"]] <- educ_group(visit_1[["EDUC"]])

# Show each raw value next to the category derived from it
head(
  select(visit_1, AGE, AGE_GROUP, BMI, BMI_GROUP, EDUC, EDUC_GROUP),
  n = 5
)


# Count participants per category and sex, reshape each set of counts into a
# category-by-sex table, and stack the three tables

# Age group by sex
odata <- count(visit_1, AGE_GROUP, SEX_GROUP)
data <- matrix(table(odata$AGE_GROUP, odata$SEX_GROUP), ncol = 2)
counts1 <- to_nice_table(odata, data)

# BMI group by sex
odata <- count(visit_1, BMI_GROUP, SEX_GROUP)
data <- matrix(table(odata$BMI_GROUP, odata$SEX_GROUP), ncol = 2)
counts2 <- to_nice_table(odata, data)

# Education group by sex
odata <- count(visit_1, EDUC_GROUP, SEX_GROUP)
data <- matrix(table(odata$EDUC_GROUP, odata$SEX_GROUP), ncol = 2)
counts3 <- to_nice_table(odata, data)

binder <- rbind(counts1, counts2, counts3)
print(binder)

                                Female Male
32-40 yr                           415  339
41-50 yr                           908  731
51-60 yr                           795  584
61-70 yr                           372  290
healthy                           1233  703
NA                                  14    5
obese                              345  232
overweight                         853  992
underweight                         45   12
0-11 years                         979  843
College (BS, BA) degree or more    206  296
High School Diploma, GED           772  509
NA                                  56   57
Some College, Vocational School    477  239


# Stacked bar charts of participant counts per demographic group, split by sex
options(repr.plot.width = 15, repr.plot.height = 8)

ggplot(visit_1, aes(x = AGE_GROUP, fill = SEX_GROUP)) +
  geom_bar() +
  labs(x = "Age Group", y = "Count", fill = "Sex Group") +
  ggtitle("Age Group Characteristics of Participants in the Framingham Heart Study")

ggplot(visit_1, aes(x = BMI_GROUP, fill = SEX_GROUP)) +
  geom_bar() +
  labs(x = "BMI Group", y = "Count", fill = "Sex Group") +
  ggtitle("BMI Group Characteristics of Participants in the Framingham Heart Study")

# The education labels are long, so they are rotated to stay readable
ggplot(visit_1, aes(x = EDUC_GROUP, fill = SEX_GROUP)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Education", y = "Count", fill = "Sex Group") +
  ggtitle("Education Group Characteristics of Participants in the Framingham Heart Study")


# Attach the same demographic categories to the full frm_data (all visits)
frm_data[["AGE_GROUP"]] <- age_group(frm_data[["AGE"]])
frm_data[["BMI_GROUP"]] <- bmi_group(frm_data[["BMI"]])
frm_data[["EDUC_GROUP"]] <- educ_group(frm_data[["EDUC"]])
head(frm_data, n = 5)


# Keep the identifier, sex, timing and risk-factor measurement columns
rf_data <- select(
  frm_data,
  RANDID, SEX_GROUP, TIME, PERIOD,
  BMI, AGE, SYSBP, DIABP, CIGPDAY, TOTCHOL, HDLC, LDLC, GLUCOSE
)


# Reshape to long format: one row per participant, visit and measurement,
# with the measurement name in `variable` and its reading in `value`
rf_data_melt <- pivot_longer(
  rf_data,
  cols = c(BMI, AGE, SYSBP, DIABP, CIGPDAY, TOTCHOL, HDLC, LDLC, GLUCOSE),
  names_to = "variable",
  values_to = "value"
)
head(rf_data_melt, n = 5)


# Violin plots with overlaid box plots of AGE and BMI at each visit
dodge <- position_dodge(width = 0.4)
options(repr.plot.width = 13, repr.plot.height = 8)
rf_data_melt %>%
  filter(variable %in% c("BMI", "AGE")) %>%
  drop_na() %>%
  ggplot(aes(x = factor(PERIOD), y = value, fill = variable)) +
  geom_violin(position = dodge) +
  geom_boxplot(width = .1, position = dodge) +
  labs(x = "Visit", y = "Value", fill = "Variable")


# Box plots of blood pressure, cholesterol and glucose at each visit
options(repr.plot.width = 13, repr.plot.height = 8)
rf_data_melt %>%
  filter(variable %in% c("SYSBP", "DIABP", "TOTCHOL", "GLUCOSE")) %>%
  drop_na() %>%
  ggplot(aes(x = factor(PERIOD), y = value, fill = variable)) +
  geom_boxplot(width = .7) +
  labs(x = "Visit", y = "Value", fill = "Variable")


# Keep the last row recorded for each participant
# NOTE(review): this is the final visit only if rows are ordered by visit
# within each RANDID - confirm against the data
is_last_row <- !duplicated(frm_data$RANDID, fromLast = TRUE)
visit_fin <- frm_data[is_last_row, ]

# Restrict to the columns used as cardiovascular risk factors
visit_rf <- select(
  visit_fin,
  RANDID, AGE, SEX_GROUP, CURSMOKE, TOTCHOL, BMI, GLUCOSE,
  PREVCHD, PREVSTRK, PREVHYP, DIABETES, HEARTRTE
)

# Turn a numeric column into a 0/1 risk-factor flag.
#
# Args:
#   column:   numeric vector of measurements; may contain NA.
#   criteria: numeric threshold.
#   eorg:     TRUE  -> flag values greater than or equal to `criteria`;
#             FALSE -> flag values strictly greater than `criteria`.
# Returns:
#   Numeric vector with one element per element of `column`: 1 where the
#   value meets the threshold, 0 otherwise. Missing values are flagged 0.
col_rf <- function(column, criteria, eorg) {
  values <- as.numeric(column)
  meets_criteria <- if (eorg) values >= criteria else values > criteria

  # Vectorized on purpose: the loop version advanced its write position only
  # for non-missing values when eorg was TRUE, so every element after an NA
  # was written to the wrong index
  as.numeric(!is.na(meets_criteria) & meets_criteria)
}

# Flag each measurement that crosses its threshold:
# TOTCHOL > 200, BMI > 25, GLUCOSE > 200, HEARTRTE > 100, AGE >= 60
visit_rf[["TOTCHOL_RF"]] <- col_rf(visit_rf[["TOTCHOL"]], 200, FALSE)
visit_rf[["BMI_RF"]] <- col_rf(visit_rf[["BMI"]], 25, FALSE)
visit_rf[["GLUCOSEL_RF"]] <- col_rf(visit_rf[["GLUCOSE"]], 200, FALSE)
visit_rf[["HEARTRTE_RF"]] <- col_rf(visit_rf[["HEARTRTE"]], 100, FALSE)
visit_rf[["AGE_RF"]] <- col_rf(visit_rf[["AGE"]], 60, TRUE)

# Drop the raw measurements now that their flags exist, along with SEX_GROUP
visit_rf <- select(
  visit_rf,
  -AGE, -TOTCHOL, -BMI, -GLUCOSE, -HEARTRTE, -SEX_GROUP
)
head(visit_rf)


# Put the event columns from the same final-visit rows next to the
# risk-factor flags
event_data <- select(
  visit_fin,
  ANGINA, HOSPMI, MI_FCHD, ANYCHD, STROKE, CVD, HYPERTEN, DEATH
)
visit_erf <- cbind(visit_rf, event_data)
head(visit_erf)


# Correlate every risk-factor and event column with every other one
# (RANDID is an identifier, so it is left out), then melt the correlation
# matrix into long format for plotting
visit_rf_noid <- select(visit_erf, -RANDID)
vrn <- cor(as.matrix(visit_rf_noid))
heat_data <- melt(vrn)
colnames(heat_data) <- c("x", "y", "value")


# Draw the correlation heat map, one tile per pair of columns
options(repr.plot.width = 9.5, repr.plot.height = 8)
ggplot(heat_data, aes(x = x, y = y, fill = value)) +
  geom_tile() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


# Number of risk factors present for each participant
visit_erf$RF_SUM <- with(
  visit_erf,
  CURSMOKE + PREVCHD + PREVSTRK + PREVHYP + DIABETES +
    TOTCHOL_RF + BMI_RF + GLUCOSEL_RF + HEARTRTE_RF + AGE_RF
)
tail(visit_erf)


# Number of event indicators (ANYCHD, STROKE, CVD, HYPERTEN) set for each
# participant
visit_erf$EVENT_SUM <- with(
  visit_erf,
  ANYCHD + STROKE + CVD + HYPERTEN
)


# Count participants for every combination of risk-factor sum and event sum
rf_count <- visit_erf %>% count(RF_SUM, EVENT_SUM)

rf_sum <- as.numeric(rf_count$RF_SUM)
event_sum <- as.numeric(rf_count$EVENT_SUM)

# Rows are RF_SUM values and columns are EVENT_SUM values, both starting at 0.
# The table is at least 9 x 5 and grows if a larger sum occurs in the data,
# instead of failing with "subscript out of bounds"
rf_levels <- 0:max(c(8, rf_sum), na.rm = TRUE)
event_levels <- 0:max(c(4, event_sum), na.rm = TRUE)

# Start from zeros so combinations that never occur need no separate fill pass
tabled_count <- matrix(
  0,
  nrow = length(rf_levels),
  ncol = length(event_levels),
  dimnames = list(
    sprintf("%.1f", as.numeric(rf_levels)),
    sprintf("%.1f", as.numeric(event_levels))
  )
)

# Write each observed count into its cell. Only the rows present in rf_count
# are visited; combinations with a missing sum have no cell and are skipped
has_cell <- !is.na(rf_sum) & !is.na(event_sum)
cell_index <- cbind(rf_sum[has_cell] + 1, event_sum[has_cell] + 1)
tabled_count[cell_index] <- rf_count$n[has_cell]

print(tabled_count)

    0.0 1.0 2.0 3.0 4.0
0.0  60  15   1   0   0
1.0 250 152  19  14   2
2.0 413 472  77  56   4
3.0 203 741 123 233  28
4.0  31 552 105 297  52
5.0   0 114  73 169  44
6.0   0  13  20  41  32
7.0   0   2   1  11  10
8.0   0   0   0   2   2


# Stacked bars: participants per risk-factor sum, coloured by event sum
options(repr.plot.width = 20, repr.plot.height = 10)
ggplot(rf_count, aes(x = factor(RF_SUM), y = n, fill = factor(EVENT_SUM))) +
  geom_bar(stat = "identity") +
  labs(x = "RF Sum", y = "count", fill = "Event Sum") +
  scale_fill_manual(
    values = c("#1cd600", "#a11e11", "#80180d", "#60120a", "#400c06")
  ) +
  ggtitle("Risk Factors and Disease Events")

RANDID	SEX	TOTCHOL	AGE	SYSBP	DIABP	CURSMOKE	CIGPDAY	BMI	DIABETES	⋯	HYPERTEN	TIMEAP	TIMEMI	TIMEMIFC	TIMECHD	TIMESTRK	TIMECVD	TIMEDTH	TIMEHYP	SEX_GROUP
<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	⋯	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>
2448	1	195	39	106.0	70	0	0	26.97	0	⋯	0	8766	6438	6438	6438	8766	6438	8766	8766	Male
6238	2	250	46	121.0	81	0	0	28.73	0	⋯	0	8766	8766	8766	8766	8766	8766	8766	8766	Female
9428	1	245	48	127.5	80	1	20	25.34	0	⋯	0	8766	8766	8766	8766	8766	8766	8766	8766	Male
10552	2	225	61	150.0	95	1	30	28.58	0	⋯	1	2956	2956	2956	2956	2089	2089	2956	0	Female
11252	2	285	46	130.0	84	1	23	23.10	0	⋯	1	8766	8766	8766	8766	8766	8766	8766	4285	Female

RANDID	SEX	TOTCHOL	AGE	SYSBP	DIABP	CURSMOKE	CIGPDAY	BMI	DIABETES	⋯	TIMEMIFC	TIMECHD	TIMESTRK	TIMECVD	TIMEDTH	TIMEHYP	SEX_GROUP	AGE_GROUP	BMI_GROUP	EDUC_GROUP
<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	⋯	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>	<chr>	<chr>	<chr>
2448	1	195	39	106	70.0	0	0	26.97	0	⋯	6438	6438	8766	6438	8766	8766	Male	32-40 yr	overweight	College (BS, BA) degree or more
2448	1	209	52	121	66.0	0	0	NA	0	⋯	6438	6438	8766	6438	8766	8766	Male	51-60 yr	NA	College (BS, BA) degree or more
6238	2	250	46	121	81.0	0	0	28.73	0	⋯	8766	8766	8766	8766	8766	8766	Female	41-50 yr	overweight	High School Diploma, GED
6238	2	260	52	105	69.5	0	0	29.43	0	⋯	8766	8766	8766	8766	8766	8766	Female	51-60 yr	overweight	High School Diploma, GED
6238	2	237	58	108	66.0	0	0	28.50	0	⋯	8766	8766	8766	8766	8766	8766	Female	51-60 yr	overweight	High School Diploma, GED

RANDID	CURSMOKE	PREVCHD	PREVSTRK	PREVHYP	DIABETES	TOTCHOL_RF	BMI_RF	GLUCOSEL_RF	HEARTRTE_RF	AGE_RF
<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
2448	0	0	0	0	0	1	0	0	0	0
6238	0	0	0	0	0	1	1	0	0	0
9428	1	0	0	0	0	1	1	0	0	0
10552	1	0	0	1	0	1	1	0	0	1
11252	1	0	0	1	0	0	0	0	0	0
11263	0	0	0	1	1	1	1	0	0	0

	RANDID	CURSMOKE	PREVCHD	PREVSTRK	PREVHYP	DIABETES	TOTCHOL_RF	BMI_RF	GLUCOSEL_RF	HEARTRTE_RF	AGE_RF	ANGINA	HOSPMI	MI_FCHD	ANYCHD	STROKE	CVD	HYPERTEN	DEATH
	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
1	2448	0	0	0	0	0	1	0	0	0	0	0	1	1	1	0	1	0	0
2	6238	0	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	0
3	9428	1	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	0
4	10552	1	0	0	1	0	1	1	0	0	1	0	0	0	0	1	1	1	1
5	11252	1	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	0
6	11263	0	0	0	1	1	1	1	0	0	0	0	0	1	1	0	1	1	0

	RANDID	CURSMOKE	PREVCHD	PREVSTRK	PREVHYP	DIABETES	TOTCHOL_RF	BMI_RF	GLUCOSEL_RF	HEARTRTE_RF	AGE_RF	ANGINA	HOSPMI	MI_FCHD	ANYCHD	STROKE	CVD	HYPERTEN	DEATH	RF_SUM
	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
4429	9989287	1	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	1	2
4430	9990894	1	0	0	1	0	1	0	0	0	1	0	0	0	0	0	0	1	1	4
4431	9993179	1	0	0	1	0	1	0	0	0	0	0	0	0	0	0	0	1	1	3
4432	9995546	0	0	0	1	0	1	0	0	0	0	1	0	1	1	0	1	1	0	2
4433	9998212	0	0	0	1	0	0	1	0	0	0	0	0	0	0	0	0	1	0	2
4434	9999312	1	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	0	2

Data Exploration of Framingham Heart Study Teaching Dataset in R

Install and set the required R Libraries

Pull the Framingham data file

Basic data manipulation

Visualizing the manipulated data

Risk Factor Exploration

Categorizing Risk Factors

Correlation between risk and event factors