import pandas as pd
import numpy as np
import pyreadstat
import seaborn as sns
import matplotlib.pyplot as plt

# Modules for regression
import statsmodels.api as sm  # For statistical models
from statsmodels.sandbox.regression.predstd import wls_prediction_std # For prediction interval
from sklearn.preprocessing import StandardScaler  # For feature scaling
from sklearn.linear_model import LogisticRegression  # For logistic regression
from sklearn.model_selection import train_test_split  # For data splitting
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  # For model evaluation


# Read data from "frmgham2.sav" file using pyreadstat library
fram_df, fram_meta = pyreadstat.read_sav("frmgham2.sav", apply_value_formats=True)

# Display the first few rows of the DataFrame
fram_df.head()


# Set option to display all columns in DataFrame
pd.set_option('display.max_columns', None)

# Print the shape (rows, columns) of the DataFrame
fram_df.shape

(11627, 39)


fram_df.dtypes

RANDID      float64
SEX         float64
TOTCHOL     float64
AGE         float64
SYSBP       float64
DIABP       float64
CURSMOKE    float64
CIGPDAY     float64
BMI         float64
DIABETES    float64
BPMEDS      float64
HEARTRTE    float64
GLUCOSE     float64
EDUC        float64
PREVCHD     float64
PREVAP      float64
PREVMI      float64
PREVSTRK    float64
PREVHYP     float64
TIME        float64
PERIOD      float64
HDLC        float64
LDLC        float64
DEATH       float64
ANGINA      float64
HOSPMI      float64
MI_FCHD     float64
ANYCHD      float64
STROKE      float64
CVD         float64
HYPERTEN    float64
TIMEAP      float64
TIMEMI      float64
TIMEMIFC    float64
TIMECHD     float64
TIMESTRK    float64
TIMECVD     float64
TIMEDTH     float64
TIMEHYP     float64
dtype: object


fram_meta.column_names_to_labels

{'RANDID': 'Random ID',
 'SEX': 'SEX',
 'TOTCHOL': 'Serum Cholesterol mg/dL',
 'AGE': 'Age (years) at examination',
 'SYSBP': 'Systolic BP mmHg',
 'DIABP': 'Diastolic BP mmHg',
 'CURSMOKE': 'Current Cig Smoker Y/N',
 'CIGPDAY': 'Cigarettes per day',
 'BMI': 'Body Mass Index (kg/(M*M)',
 'DIABETES': 'Diabetic Y/N',
 'BPMEDS': 'Anti-hypertensive meds Y/N',
 'HEARTRTE': 'Ventricular Rate (beats/min)',
 'GLUCOSE': 'Casual Glucose mg/dL',
 'EDUC': '0-11 years, HS or GED, Some Coll, Coll Grad+',
 'PREVCHD': 'Prevalent CHD (MI,AP,CI)',
 'PREVAP': 'Prevalent Angina',
 'PREVMI': 'Prevalent MI (Hosp,Silent)',
 'PREVSTRK': 'Prevalent Stroke (Infarct,Hem)',
 'PREVHYP': 'Prevalent Hypertension',
 'TIME': 'Days since Index Exam',
 'PERIOD': 'Examination cycle',
 'HDLC': 'HDL Cholesterol mg/dL',
 'LDLC': 'LDL Cholesterol mg/dL',
 'DEATH': 'Death indicator',
 'ANGINA': 'Incident Angina Pectoris',
 'HOSPMI': 'Incident Hospitalized MI',
 'MI_FCHD': 'Incident Hosp MI-Fatal CHD',
 'ANYCHD': 'Incident Hosp MI, AP, CI, Fatal CHD',
 'STROKE': 'Incident Stroke Fatal/non-fatal',
 'CVD': 'Incident Hosp MI or Stroke, Fatal or Non',
 'HYPERTEN': 'Incident Hypertension',
 'TIMEAP': 'Days Baseline-Inc Angina',
 'TIMEMI': 'Days Baseline-Inc Hosp MI',
 'TIMEMIFC': 'Days Baseline-Inc MI-Fatal CHD',
 'TIMECHD': 'Days Baseline-Inc Any CHD',
 'TIMESTRK': 'Days Baseline-Inc Stroke',
 'TIMECVD': 'Days Baseline-Inc CVD',
 'TIMEDTH': 'Days Baseline-Death',
 'TIMEHYP': 'Days Baseline-Inc Hypertension'}


# Select specific columns from the DataFrame
fram_df = fram_df[['RANDID', 'SEX', 'AGE', 'SYSBP', 'DIABP', 'CURSMOKE', 'CIGPDAY', 'BMI', 'TOTCHOL', 'HDLC', 'LDLC', 'GLUCOSE', 'DIABETES',
                   'PERIOD', 'DEATH', 'ANYCHD', 'STROKE', 'CVD', 'HYPERTEN']]
# Create a mapping dictionary for converting numeric gender values to labels
sexMap = {1: 'Male', 2: 'Female'}
# Add a new 'GENDER' column based on the 'SEX' column using the mapping dictionary
new_col = fram_df['SEX'].map(sexMap)
fram_df['GENDER'] = new_col


fram_1 = fram_df[fram_df['PERIOD']==1]
fram_2 = fram_df[fram_df['PERIOD']==2]
fram_3 = fram_df[fram_df['PERIOD']==3]


print('There are '+ str(fram_1['RANDID'].duplicated().sum())+' duplicates for period 1')
print('There are '+ str(fram_2['RANDID'].duplicated().sum())+' duplicates for period 2')
print('There are '+ str(fram_3['RANDID'].duplicated().sum())+' duplicates for period 3')

There are 0 duplicates for period 1
There are 0 duplicates for period 2
There are 0 duplicates for period 3


fram_df.describe().T


# Melt the DataFrame for specific variables
df_melted_1 = fram_df.melt(var_name='Variables', value_name='Values', value_vars=['TOTCHOL', 'HDLC', 'LDLC', 'GLUCOSE'])
df_melted_2 = fram_df.melt(var_name='Variables', value_name='Values', value_vars=['AGE', 'SYSBP', 'DIABP', 'CIGPDAY', 'BMI'])

# Create a subplot with two columns for side-by-side plots
plt.figure(figsize=(10, 5))

# Plot the first box plot on the left side
plt.subplot(1, 2, 1)
sns.boxplot(x='Variables', y='Values', data=df_melted_1, palette='Pastel2')
plt.title('Box Plot for Serum Total Cholesterol, High Density\n Lipoprotein Cholesterol, Low Density Lipoprotein\n Cholesterol, and Casual serum glucose')
plt.xlabel('Variables')
plt.ylabel('Values')

# Plot the second box plot on the right side
plt.subplot(1, 2, 2)
sns.boxplot(x='Variables', y='Values', data=df_melted_2, palette='Pastel1')
plt.title('Box Plot for Age, Systolic Blood Pressure, Diastolic\n Blood Pressure, Number of cigarettes smoked each day,\n Body Mass Index, and Heart rate')
plt.xlabel('Variables')
plt.ylabel('Values')

# Adjust layout and display the plots
plt.tight_layout()
plt.show()


# Create a figure with 1 row and 3 columns of subplots, sharing the same x and y axes
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharex=True, sharey=True)

# Arrange the DataFrames for each period and sort them by gender
data_frames = [fram_1.sort_values(by=['GENDER']), fram_2.sort_values(by=['GENDER']), fram_3.sort_values(by=['GENDER'])]

# Titles for the subplots
titles = ['Period 1', 'Period 2', 'Period 3']

# Loop through the data frames and corresponding axes
for i, df in enumerate(data_frames):
    # Create a histogram plot with stacked bars for different genders, and overlay KDE
    sns.histplot(data=df, x='AGE', hue='GENDER', palette='viridis', multiple='stack', edgecolor='white', ax=axes[i], bins=38)

    # Set x and y labels for the current subplot
    axes[i].set_xlabel('Age')
    axes[i].set_ylabel('Frequency')

    # Set the title for the current subplot
    axes[i].set_title(titles[i])

# Adjust layout and display the subplots
plt.tight_layout()
plt.show()


# Create a 1x3 grid of subplots
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharex=True, sharey=True)

# List of DataFrames for each period, sorted by gender
data_frames = [fram_1.sort_values(by=['GENDER']), fram_2.sort_values(by=['GENDER']), fram_3.sort_values(by=['GENDER'])]

# Titles for the subplots
titles = ['Period 1', 'Period 2', 'Period 3']

# Loop through the DataFrames and corresponding axes
for i, df in enumerate(data_frames):
    # Create a histogram plot of BMI, colored by gender
    sns.histplot(data=df, x='BMI', hue='GENDER', palette='coolwarm', multiple='layer', edgecolor='white', ax=axes[i])

    # Set x and y labels for the current subplot
    axes[i].set_xlabel('Body Mass Index (weight in kg/height m^2)')
    axes[i].set_ylabel('Frequency')

    # Set the title for the current subplot
    axes[i].set_title(titles[i])

# Adjust layout and display the subplots
plt.tight_layout()
plt.show()


# Create a 1x3 grid of subplots
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharex=True, sharey=True)

# List of DataFrames for each period, sorted by gender
data_frames = [fram_1.sort_values(by=['GENDER']), fram_2.sort_values(by=['GENDER']), fram_3.sort_values(by=['GENDER'])]

# Titles for the subplots
titles = ['Period 1', 'Period 2', 'Period 3']

# Loop through the DataFrames and corresponding axes
for i, df in enumerate(data_frames):
    # Create a histogram plot of Systolic Blood Pressure, colored by gender
    sns.histplot(data=df, x='SYSBP', hue='GENDER', palette='viridis', multiple='layer', edgecolor='white', ax=axes[i])

    # Set x and y labels for the current subplot
    axes[i].set_xlabel('Systolic Blood Pressure (mmHg)')
    axes[i].set_ylabel('Frequency')

    # Set the title for the current subplot
    axes[i].set_title(titles[i])

# Adjust layout and display the subplots
plt.tight_layout()
plt.show()


# Create a 1x3 grid of subplots
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharex=True, sharey=True)

# List of DataFrames for each period, sorted by gender
data_frames = [fram_1.sort_values(by=['GENDER']), fram_2.sort_values(by=['GENDER']), fram_3.sort_values(by=['GENDER'])]

# Titles for the subplots
titles = ['Period 1', 'Period 2', 'Period 3']

# Loop through the DataFrames and corresponding axes
for i, df in enumerate(data_frames):
    # Create a histogram plot of Serum Total Cholesterol, colored by gender
    sns.histplot(data=df, x='TOTCHOL', hue='GENDER', palette='magma', multiple='layer', edgecolor='white', ax=axes[i])

    # Set x and y labels for the current subplot
    axes[i].set_xlabel('Serum Total Cholesterol (mg/dL)')
    axes[i].set_ylabel('Frequency')

    # Set the title for the current subplot
    axes[i].set_title(titles[i])

# Adjust layout and display the subplots
plt.tight_layout()
plt.show()


# Create a new column that combines 'ANYCHD' and 'STROKE', counting for all indicators of cardiovascular diseases
fram_1.loc[:,'ANYCHD_OR_STROKE'] = (fram_1['ANYCHD'] + fram_1['STROKE'] > 0).astype(int)

# Create new DataFrame
df = fram_1[['SEX','TOTCHOL','AGE','SYSBP','DIABP','CURSMOKE','BMI','DIABETES','GLUCOSE','HYPERTEN','ANYCHD_OR_STROKE']]

# Calculate the correlation matrix
correlation_matrix = df.corr()

# Create the heat map with a color range of -1 to 1
plt.figure(figsize=(8, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, vmin=-1, vmax=1)
plt.title('Correlation Heat Map')
plt.show()


# Create a contingency table
contingency_table = pd.crosstab(df['SEX'], df['ANYCHD_OR_STROKE'])

# Rename the rows (SEX categories) and columns (ANYCHD_OR_STROKE categories)
contingency_table = contingency_table.rename(
    index={1: 'Male', 2: 'Female'},
    columns={0: 'No', 1: 'Yes'}
)

# Calculate the percentage of "Yes" (positive) outcomes
contingency_table['percentage'] = contingency_table['Yes'] / (contingency_table['Yes'] + contingency_table['No'])

# Print the resulting contingency table
print(contingency_table)

ANYCHD_OR_STROKE    No  Yes  percentage
SEX                                    
Male              1128  816    0.419753
Female            1830  660    0.265060


# Set the style to 'whitegrid'
sns.set(style='whitegrid')

# Create scatter plots for each variable against 'SYSBP'
plt.figure(figsize=(15, 4))

# Plot 'TOTCHOL' against 'SYSBP'
plt.subplot(1, 3, 1)
x_jitter = 10 * np.random.randn(df['TOTCHOL'].size)  # Add jitter to x-axis data
y_jitter = 4 * np.random.randn(df['SYSBP'].size)      # Add jitter to y-axis data
sns.scatterplot(data=df, x=df['TOTCHOL'] + x_jitter, y=df['SYSBP'] + y_jitter, s=5, alpha=0.5)
plt.xlabel('Serum Total Cholesterol (mg/dL)')
plt.ylabel('Systolic Blood Pressure (mmHg)')
plt.xlim(100, 450)  # Set x-axis limits
plt.ylim(75, 225)   # Set y-axis limits

# Plot 'BMI' against 'SYSBP'
plt.subplot(1, 3, 2)
x_jitter = 1 * np.random.randn(df['BMI'].size)        # Add jitter to x-axis data
y_jitter = 4 * np.random.randn(df['SYSBP'].size)      # Add jitter to y-axis data
sns.scatterplot(data=df, x=df['BMI'] + x_jitter, y=df['SYSBP'] + y_jitter, s=5, alpha=0.5)
plt.xlabel('BMI (kg/m^2)')
plt.ylabel('Systolic Blood Pressure (mmHg)')
plt.xlim(10, 45)    # Set x-axis limits
plt.ylim(75, 225)   # Set y-axis limits

# Plot 'AGE' against 'SYSBP'
plt.subplot(1, 3, 3)
x_jitter = 1 * np.random.randn(df['AGE'].size)        # Add jitter to x-axis data
y_jitter = 4 * np.random.randn(df['SYSBP'].size)      # Add jitter to y-axis data
sns.scatterplot(data=fram_df, x=df['AGE'] + x_jitter, y=df['SYSBP'] + y_jitter, s=5, alpha=0.5)
plt.xlabel('Age')
plt.ylabel('Systolic Blood Pressure (mmHg)')
plt.xlim(30, 70)    # Set x-axis limits
plt.ylim(75, 225)   # Set y-axis limits

# Adjust the layout to avoid overlapping of titles
plt.tight_layout()

# Display the plots
plt.show()


# Create a figure with a grid of 1 row and 3 columns, specifying the figure size
plt.figure(figsize=(15, 4))

# Create the first subplot for 'TOTCHOL' vs 'SYSBP'
plt.subplot(1, 3, 1)
# Generate a density-based colored scatter plot using kdeplot
sns.kdeplot(data=df, x=df['TOTCHOL'], y=df['SYSBP'], fill=True, cmap='Blues', levels=20)
# Set x-axis and y-axis labels
plt.xlabel('Serum Total Cholesterol (mg/dL)')
plt.ylabel('Systolic Blood Pressure (mmHg)')
# Adjust x-axis and y-axis limits
plt.xlim(100, 450)
plt.ylim(75, 225)

# Create the second subplot for 'BMI' vs 'SYSBP'
plt.subplot(1, 3, 2)
# Generate a density-based colored scatter plot using kdeplot
sns.kdeplot(data=df, x=df['BMI'], y=df['SYSBP'], fill=True, cmap='Purples', levels=20)
# Set x-axis and y-axis labels
plt.xlabel('BMI (kg/m^2)')
plt.ylabel('Systolic Blood Pressure (mmHg)')
# Adjust x-axis and y-axis limits
plt.xlim(10, 45)
plt.ylim(75, 225)

# Create the third subplot for 'AGE' vs 'SYSBP'
plt.subplot(1, 3, 3)
# Generate a density-based colored scatter plot using kdeplot
sns.kdeplot(data=df, x=df['AGE'], y=df['SYSBP'], fill=True, cmap='Reds', levels=20)
# Set x-axis and y-axis labels
plt.xlabel('Age')
plt.ylabel('Systolic Blood Pressure (mmHg)')
# Adjust x-axis and y-axis limits
plt.xlim(30, 70)
plt.ylim(75, 225)

# Adjust the layout to prevent overlapping of subplot titles
plt.tight_layout()

# Display the plots
plt.show()


# List of columns you want to filter
columns_to_filter = ['TOTCHOL', 'SYSBP', 'DIABP', 'BMI', 'GLUCOSE']

# Calculate z-scores for the specified columns
z_scores = (df[columns_to_filter] - df[columns_to_filter].mean()) / df[columns_to_filter].std()

# Find indices where z-scores are beyond ±3 standard deviations for each column
outlier_indices = z_scores[(z_scores > 3) | (z_scores < -3)].dropna().index

# Drop the rows with outlier indices from the DataFrame
filtered_df = df.drop(outlier_indices)


# Print the shape of the filtered DataFrame to see the number of rows and columns
print(filtered_df.shape)

# Check for missing values in the filtered DataFrame and sum them up for each column
filtered_df.isnull().sum()

(4434, 11)

SEX                   0
TOTCHOL              52
AGE                   0
SYSBP                 0
DIABP                 0
CURSMOKE              0
BMI                  19
DIABETES              0
GLUCOSE             397
HYPERTEN              0
ANYCHD_OR_STROKE      0
dtype: int64


#Drop rows with missing values from the DataFrame
filtered_df.dropna(inplace=True)
#Display the new shape of the DataFrame
filtered_df.shape

(4013, 11)


# Select the predictor variable (X) and the target variable (y)
X = filtered_df['BMI']
y = filtered_df['SYSBP']

# Add a constant term to the predictor variable
X = sm.add_constant(X)

# Fit the OLS (ordinary least squares) model
model = sm.OLS(y, X).fit()

# Print the summary of the regression
print(model.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  SYSBP   R-squared:                       0.108
Model:                            OLS   Adj. R-squared:                  0.108
Method:                 Least Squares   F-statistic:                     485.0
Date:                Wed, 23 Aug 2023   Prob (F-statistic):          1.44e-101
Time:                        04:31:14   Log-Likelihood:                -17959.
No. Observations:                4013   AIC:                         3.592e+04
Df Residuals:                    4011   BIC:                         3.593e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         86.3001      2.149     40.159      0.000      82.087      90.513
BMI            1.8083      0.082     22.023      0.000       1.647       1.969
==============================================================================
Omnibus:                      807.041   Durbin-Watson:                   1.985
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1735.591
Skew:                           1.162   Prob(JB):                         0.00
Kurtosis:                       5.232   Cond. No.                         168.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


# Calculate the residuals
residuals = model.resid

# Create residual plots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Residuals vs Fitted values plot
ax1.scatter(X['BMI'], residuals, alpha=0.6, s=1)
ax1.axhline(y=0, color='red', linestyle='--')
ax1.set_xlabel('BMI')
ax1.set_ylabel('Residuals')
ax1.set_title('Residuals Plot')

# Q-Q Plot (Quantile-Quantile Plot) of the residuals
sm.qqplot(residuals, line='s', ax=ax2, markersize=1)
ax2.set_title('Q-Q Plot of Residuals')

plt.tight_layout()
plt.show()


# Plot the regression results along with confidence interval
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111)
ax.scatter(filtered_df['BMI'], y, s=1, alpha=0.6, label="Data")
ax.plot(filtered_df['BMI'].to_numpy(), model.predict(X).to_numpy(), color='red', label="Regression Line")
ax.set_xlabel('BMI')
ax.set_ylabel('SYSBP')

## Plot the prediction interval
prstd, iv_l, iv_u = wls_prediction_std(model)
ax.plot(filtered_df['BMI'].to_numpy(), iv_u.to_numpy(), color='grey', linestyle='--', label='95% Prediction Interval')
ax.plot(filtered_df['BMI'].to_numpy(), iv_l.to_numpy(), color='grey', linestyle='--')

# Plot the confidence intervals
predict_mean_ci = model.get_prediction(X).conf_int()
ax.plot(filtered_df['BMI'].to_numpy(), predict_mean_ci[:, 0], color='purple', linestyle='--', linewidth=0.5, label='95% Confidence Interval')
ax.plot(filtered_df['BMI'].to_numpy(), predict_mean_ci[:, 1], color='purple', linestyle='--', linewidth=0.5)

# Display the legend
ax.legend()
plt.show()


# Select predictor variables (X) and the target variable (y)
X = filtered_df[['TOTCHOL', 'AGE', 'BMI', 'GLUCOSE']]
y = filtered_df['SYSBP']  # Replace with the actual target variable name

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the predictor variables and transform them
X_scaled_values = scaler.fit_transform(X)

# Create a new DataFrame with scaled values and original column names
X_scaled = pd.DataFrame(X_scaled_values, columns=X.columns)

# Add a constant term to the predictor variables
X_scaled_with_constant = sm.add_constant(X_scaled)

# Reset the index of the target variable y
y_reset = y.reset_index(drop=True)

# Fit the multivariable linear regression model
model = sm.OLS(y_reset, X_scaled_with_constant).fit()

# Print the summary of the regression results
print(model.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  SYSBP   R-squared:                       0.244
Model:                            OLS   Adj. R-squared:                  0.243
Method:                 Least Squares   F-statistic:                     323.2
Date:                Wed, 23 Aug 2023   Prob (F-statistic):          2.39e-241
Time:                        04:31:27   Log-Likelihood:                -17627.
No. Observations:                4013   AIC:                         3.526e+04
Df Residuals:                    4008   BIC:                         3.529e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        133.0470      0.309    430.639      0.000     132.441     133.653
TOTCHOL        1.9756      0.320      6.167      0.000       1.348       2.604
AGE            7.4275      0.323     23.024      0.000       6.795       8.060
BMI            6.0429      0.314     19.249      0.000       5.427       6.658
GLUCOSE        1.2864      0.312      4.119      0.000       0.674       1.899
==============================================================================
Omnibus:                      637.733   Durbin-Watson:                   2.008
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1374.770
Skew:                           0.935   Prob(JB):                    2.97e-299
Kurtosis:                       5.174   Cond. No.                         1.38
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


X = filtered_df[['SEX', 'TOTCHOL', 'AGE', 'SYSBP', 'BMI', 'GLUCOSE']]
y = filtered_df['ANYCHD_OR_STROKE']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Create and fit the logistic regression model
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logistic_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Accuracy: 0.7014925373134329
Confusion Matrix:
 [[239  25]
 [ 95  43]]
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.91      0.80       264
           1       0.63      0.31      0.42       138

    accuracy                           0.70       402
   macro avg       0.67      0.61      0.61       402
weighted avg       0.69      0.70      0.67       402

	RANDID	SEX	TOTCHOL	AGE	SYSBP	DIABP	BMI	...	CVD	TIMEAP	TIMEMI	TIMEMIFC	TIMECHD	TIMESTRK	TIMECVD	TIMEDTH	TIMEHYP
0	2448.0	1.0	195.0	39.0	106.0	70.0	26.97	...	1.0	8766.0	6438.0	6438.0	6438.0	8766.0	6438.0	8766.0	8766.0
1	2448.0	1.0	209.0	52.0	121.0	66.0	NaN	...	1.0	8766.0	6438.0	6438.0	6438.0	8766.0	6438.0	8766.0	8766.0
2	6238.0	2.0	250.0	46.0	121.0	81.0	28.73	...	0.0	8766.0	8766.0	8766.0	8766.0	8766.0	8766.0	8766.0	8766.0
3	6238.0	2.0	260.0	52.0	105.0	69.5	29.43	...	0.0	8766.0	8766.0	8766.0	8766.0	8766.0	8766.0	8766.0	8766.0
4	6238.0	2.0	237.0	58.0	108.0	66.0	28.50	...	0.0	8766.0	8766.0	8766.0	8766.0	8766.0	8766.0	8766.0	8766.0

	count	mean	std	min	25%	50%	75%	max
RANDID	11627.0	5.004741e+06	2.900877e+06	2448.00	2474378.000	5006008.00	7472730.00	9999312.0
SEX	11627.0	1.568074e+00	4.953655e-01	1.00	1.000	2.00	2.00	2.0
AGE	11627.0	5.479281e+01	9.564299e+00	32.00	48.000	54.00	62.00	81.0
SYSBP	11627.0	1.363241e+02	2.279862e+01	83.50	120.000	132.00	149.00	295.0
DIABP	11627.0	8.303776e+01	1.166014e+01	30.00	75.000	82.00	90.00	150.0
CURSMOKE	11627.0	4.325277e-01	4.954479e-01	0.00	0.000	0.00	1.00	1.0
CIGPDAY	11548.0	8.250346e+00	1.218689e+01	0.00	0.000	0.00	20.00	90.0
BMI	11575.0	2.587735e+01	4.102640e+00	14.43	23.095	25.48	28.07	56.8
TOTCHOL	11218.0	2.411624e+02	4.536803e+01	107.00	210.000	238.00	268.00	696.0
HDLC	3027.0	4.936472e+01	1.562667e+01	10.00	39.000	48.00	58.00	189.0
LDLC	3026.0	1.764670e+02	4.686339e+01	20.00	145.000	173.00	205.00	565.0
GLUCOSE	10187.0	8.412487e+01	2.499378e+01	39.00	72.000	80.00	89.00	478.0
DIABETES	11627.0	4.558356e-02	2.085892e-01	0.00	0.000	0.00	0.00	1.0
PERIOD	11627.0	1.899286e+00	8.074072e-01	1.00	1.000	2.00	3.00	3.0
DEATH	11627.0	3.033457e-01	4.597230e-01	0.00	0.000	0.00	1.00	1.0
ANYCHD	11627.0	2.716092e-01	4.448086e-01	0.00	0.000	0.00	1.00	1.0
STROKE	11627.0	9.125312e-02	2.879811e-01	0.00	0.000	0.00	0.00	1.0
CVD	11627.0	2.493334e-01	4.326458e-01	0.00	0.000	0.00	0.00	1.0
HYPERTEN	11627.0	7.432700e-01	4.368480e-01	0.00	0.000	1.00	1.00	1.0

Predictive Insights into Heart Health: Investigating the Framingham Heart Study Dataset¶

Getting to know Framingham study¶

Backgroud and Cohort¶

Findings and Impacts¶

Notebook Outline¶

0. BRH Setup and Import Libraries¶

1. Import Data Set and Data Cleaning¶

2. Exploratory Data Analysis¶

Handling Outliers and Missing Values¶

3. Statistical Modeling¶

Linear Regression¶

Logistic Regression¶